Compare commits
3 Commits: 36dd08c7c9 ... a2963024ef

Author | SHA1 | Date
---|---|---
Daniel Kapla | a2963024ef |
Daniel Kapla | 49bf4bdf20 |
Daniel Kapla | 7c33cc152f |
@@ -81,8 +81,12 @@ vignettes/*.pdf
*.RData
*.Rdata

# R Profiling
*.Rprof

# VSCode configuration
.vscode/
.lintr

## Archives, compressed files/folders
# Output files from R CMD build
@@ -95,3 +99,17 @@ wip/

# PDFs
*.pdf

# LaTeX (ignore everything except *.tex and *.bib files)
**/LaTeX/*
!**/LaTeX/*.tex
!**/LaTeX/*.bib
**/LaTeX/*-blx.bib

mlda_analysis/
References/

# Images (except images used in LaTeX)
*.png
*.svg
!**/LaTeX/*.png

LaTeX/main.tex (219 changed lines)
@@ -49,6 +49,7 @@
\newcommand{\mat}[1]{\boldsymbol{#1}}
\newcommand{\ten}[1]{\mathcal{#1}}
\renewcommand{\vec}{\operatorname{vec}}
\newcommand{\dist}{\operatorname{dist}}
\DeclareMathOperator{\kron}{\otimes} % Kronecker Product
\DeclareMathOperator{\hada}{\odot} % Hadamard Product
\newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix)

@@ -62,8 +63,8 @@
\DeclareMathOperator*{\argmax}{{arg\,max}}
\newcommand{\D}{\textnormal{D}}
\renewcommand{\d}{\textnormal{d}}
\renewcommand{\t}[1]{{{#1}'}}
\newcommand{\pinv}[1]{{{#1}^{\dagger}}} % `Moore-Penrose pseudoinverse`
\renewcommand{\t}[1]{{#1^{\prime}}}
\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse`
\newcommand{\todo}[1]{{\color{red}TODO: #1}}

% \DeclareFontFamily{U}{mathx}{\hyphenchar\font45}

@@ -87,7 +88,7 @@ We start with a brief summary of the used notation.

\todo{write this}

Let $\ten{A}$ be a order (rank) $r$ tensor of dimensions $p_1\times ... \times p_r$ and the matrices $\mat{B}_i$ of dimensions $q_i\times p_i$ for $i = 1, ..., r$, then
Let $\ten{A}$ be a multi-dimensional array of order (rank) $r$ with dimensions $p_1\times ... \times p_r$ and the matrices $\mat{B}_i$ of dimensions $q_i\times p_i$ for $i = 1, ..., r$, then
\begin{displaymath}
    \ten{A} \ttm[1] \mat{B}_1 \ttm[2] \ldots \ttm[r] \mat{B}_r
    = \ten{A}\times\{ \mat{B}_1, ..., \mat{B}_r \}
@@ -110,6 +111,65 @@ Another example

\todo{continue}

\section{Tensor Normal Distribution}
Let $\ten{X}$ be a multi-dimensional array random variable of order (rank) $r$ with dimensions $p_1\times ... \times p_r$, denoted
\begin{displaymath}
    \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
\end{displaymath}
Its density is given by
\begin{displaymath}
    f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{p_{\lnot i}}} \Big)^{-1}
    \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
\end{displaymath}
where $p_{\lnot i} = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
\begin{displaymath}
    \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1)
\end{displaymath}
with $p = \prod_{i = 1}^r p_i$.
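For example, for order $r = 2$ a $p_1\times p_2$ random matrix $\ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, \mat{\Delta}_2)$ satisfies $\vec{\ten{X}}\sim\mathcal{N}_{p_1 p_2}(\vec{\mu}, \mat{\Delta}_2\otimes\mat{\Delta}_1)$, that is, $\mat{\Delta}_1$ acts on the rows (mode 1) and $\mat{\Delta}_2$ on the columns (mode 2), recovering the Matrix-Valued Normal of the appendix.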

\begin{theorem}[Tensor Normal to Multi-Variate Normal equivalence]
    Let $\ten{X}$ be a multi-dimensional random variable of order $r$ with dimensions $p_1\times ... \times p_r$, let $\ten{\mu}$ be the mean of the same order and dimensions as $\ten{X}$, and let the mode covariance matrices $\mat{\Delta}_i$ be of dimensions $p_i\times p_i$ for $i = 1, ..., r$. Then the tensor normal distribution is equivalent to the multi-variate normal distribution by the relation
    \begin{displaymath}
        \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r)
        \qquad\Leftrightarrow\qquad
        \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes ...\otimes \mat{\Delta}_1)
    \end{displaymath}
    where $p = \prod_{i = 1}^r p_i$.
\end{theorem}
\begin{proof}
    A straightforward way is to rewrite the Tensor Normal density as the density of a Multi-Variate Normal distribution depending on the vectorization of $\ten{X}$. First consider
    \begin{align*}
        \langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle
        &= \t{\vec(\ten{X} - \mu)}\vec((\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\}) \\
        &= \t{\vec(\ten{X} - \mu)}(\mat{\Delta}_r^{-1}\otimes ...\otimes\mat{\Delta}_1^{-1})\vec(\ten{X} - \mu) \\
        &= \t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu).
    \end{align*}
    Next, the determinant identity for Kronecker products, $|\mat{\Delta}_1\otimes\mat{\Delta}_2| = |\mat{\Delta}_1|^{p_2}|\mat{\Delta}_2|^{p_1}$, yields
    \begin{displaymath}
        |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
        = |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_2|^{p_1}|\mat{\Delta}_1|^{p_{\lnot 1}}
    \end{displaymath}
    where $p_{\lnot i} = \prod_{j \neq i}p_j$. By induction over $r$ the relation
    \begin{displaymath}
        |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
        = \prod_{i = 1}^r |\mat{\Delta}_i|^{p_{\lnot i}}
    \end{displaymath}
    holds for arbitrary order $r$. Substituting into the Tensor Normal density leads to
    \begin{align*}
        f(\ten{X}) = \Big( (2\pi)^p |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| \Big)^{-1/2}
        \exp\!\left( -\frac{1}{2}\t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu) \right)
    \end{align*}
    which is the Multi-Variate Normal density of the $p$-dimensional vector $\vec\ten{X}$.
\end{proof}
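Both the inner product identity and the determinant identity are easy to check numerically. The following is a minimal R sketch for the order $r = 2$ case; the dimensions and the AR covariance matrices are illustrative choices, not taken from the text.

# Sanity check of the theorem for r = 2 (illustrative dimensions/covariances)
p1 <- 3; p2 <- 4
Delta.1 <- 0.5^abs(outer(1:p1, 1:p1, `-`))   # mode 1 covariance (p1 x p1)
Delta.2 <- 0.5^abs(outer(1:p2, 1:p2, `-`))   # mode 2 covariance (p2 x p2)
X <- matrix(rnorm(p1 * p2), p1, p2)          # one mean zero observation

# <X, X x {Delta_1^-1, Delta_2^-1}>  ==  vec(X)' (Delta_2 %x% Delta_1)^-1 vec(X)
lhs <- sum(X * (solve(Delta.1) %*% X %*% solve(Delta.2)))
rhs <- drop(crossprod(c(X), solve(kronecker(Delta.2, Delta.1), c(X))))
all.equal(lhs, rhs)   # TRUE (up to numerical error)

# |Delta_2 %x% Delta_1|  ==  |Delta_1|^p2 |Delta_2|^p1
all.equal(det(kronecker(Delta.2, Delta.1)), det(Delta.1)^p2 * det(Delta.2)^p1)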

When sampling from the Multi-Array Normal, one way is to sample from the Multi-Variate Normal and reshape the result. This is usually very inefficient because it requires storing the $p\times p$ multi-variate covariance matrix. It is more efficient to sample $\ten{Z}$ as a tensor of the same shape as $\ten{X}$ with standard normal entries and then transform $\ten{Z}$ to follow the Multi-Array Normal via
\begin{displaymath}
    \ten{Z}\sim\mathcal{TN}(0, \mat{I}_{p_1}, ..., \mat{I}_{p_r})
    \quad\Rightarrow\quad
    \ten{X} = \ten{Z}\times\{\mat{\Delta}_1^{1/2}, ..., \mat{\Delta}_r^{1/2}\} + \mu\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r)
\end{displaymath}
where sampling from the standard Multi-Array Normal is done by drawing every element of $\ten{Z}$ from a standard Normal.
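The package's exported `rtensornorm` (used in the simulation scripts below as `rtensornorm(n, 0, Delta.1, Delta.2, sample.mode = 1L)`) implements this. The following is a minimal sketch of the $r = 2$ case with illustrative dimensions, taking the matrix square roots via eigendecomposition.

# symmetric matrix square root: V diag(sqrt(lambda)) V'
msqrt <- function(S) with(eigen(S), vectors %*% (sqrt(values) * t(vectors)))

n <- 100; p1 <- 3; p2 <- 4
Delta.1 <- 0.5^abs(outer(1:p1, 1:p1, `-`))
Delta.2 <- 0.5^abs(outer(1:p2, 1:p2, `-`))

Z <- array(rnorm(n * p1 * p2), dim = c(n, p1, p2))  # standard normal entries
X <- array(dim = c(n, p1, p2))
for (i in seq_len(n)) {
  # X_i = Z_i x_1 Delta_1^(1/2) x_2 Delta_2^(1/2)   (here mu = 0)
  X[i, , ] <- msqrt(Delta.1) %*% Z[i, , ] %*% msqrt(Delta.2)
}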


\section{Introduction}
We assume the model
@@ -332,7 +392,7 @@ The unbiasednes comes directly from the following short computation;
    = \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p (\mat{\Delta}_{2})_{l,l}(\mat{\Delta}_{1})_{j,k}
    = (\mat\Delta_1\tr(\mat\Delta_2))_{j,k}.
\end{displaymath}
which means that $\E\widetilde{\mat\Delta}_1 = \mat\Delta_1\tr(\mat\Delta_2)$ and in analogy $\E\widetilde{\mat\Delta}_2 = \mat\Delta_2\tr(\mat\Delta_1)$. Now, we need to handle the scaling which can be estimated unbiased by
which means that $\E\widetilde{\mat\Delta}_1 = \mat\Delta_1\tr(\mat\Delta_2)$ and in analogy $\E\widetilde{\mat\Delta}_2 = \mat\Delta_2\tr(\mat\Delta_1)$. Now, we need to handle the scaling which can be estimated unbiasedly by
\begin{displaymath}
    \tilde{s} = \frac{1}{n}\sum_{i = 1}^n \|\mat{R}_i\|_F^2
\end{displaymath}

@@ -428,7 +488,7 @@ Putting it all together
    &\hspace{3em}\hspace{3em} \hspace{4.7em} - \tilde{s}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1} \t{\mat{R}_i} \widetilde{\mat{\Delta}}_2^{-1})
    \Big)\d\mat{\beta}\bigg).
\end{align*}
Observe that the bracketed expressions before $\d\mat{\alpha}$ and $\d\mat{\beta}$ are transposed of each other. Lets denote the expression for $\d\mat{\alpha}$ as $\mat{G}_i$ which has the form
Observe that the bracketed expressions before $\d\mat{\alpha}$ and $\d\mat{\beta}$ are transposes of each other. Let's denote the expression for $\d\mat{\alpha}$ by $\mat{G}_i$, which has the form
\begin{displaymath}
    \mat{G}_i
    = (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\mat{R}_i
@@ -468,7 +528,7 @@ Let $\ten{X}, \ten{F}$ be order (rank) $r$ tensors of dimensions $p_1\times ...
\begin{displaymath}
    \ten{X}_i = \ten{\mu} + \ten{F}_i\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon}_i
\end{displaymath}
where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distributed $\ten{\epsilon}\sim\mathcal{TM}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for $\mat{\Delta}_j\in\mathcal{S}^{p_j}_{++}$, $j = 1, ..., r$. Given $i = 1, ..., n$ observations the collected model containing all observations
where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distributed $\ten{\epsilon}\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for $\mat{\Delta}_j\in\mathcal{S}^{p_j}_{++}$, $j = 1, ..., r$. Given $i = 1, ..., n$ observations, the collected model containing all observations is
\begin{displaymath}
    \ten{X} = \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon}
\end{displaymath}

@@ -476,48 +536,87 @@ which is almost identical as the observations $\ten{X}_i, \ten{F}_i$ are stacked

In the following we assume w.l.o.g. that $\ten{\mu} = 0$; if this is not the case we simply replace $\ten{X}_i$ with $\ten{X}_i - \ten{\mu}$ for $i = 1, ..., n$ before collecting all the observations in the response tensor $\ten{X}$.

The goal here is to find reasonable estimates for $\mat{\alpha}_i$, $i = 1, ..., n$ for the mean model
The goal here is to find reasonable estimates for $\mat{\alpha}_j$, $j = 1, ..., r$, for the mean model
\begin{displaymath}
    \E \ten{X}|\ten{F}, \mat{\alpha}_1, ..., \mat{\alpha}_r = \ten{F}\times\{\mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n\}
    = \ten{F}\times_{i\in[r]}\mat{\alpha}_i.
    = \ten{F}\times_{j\in[r]}\mat{\alpha}_j.
\end{displaymath}
Under the mean model, using the general mode product relation $(\ten{A}\times_j\mat{B})_{(j)} = \mat{B}\ten{A}_{(j)}$, we get
\begin{align*}
    \ten{X}_{(j)}\t{\ten{X}_{(j)}} \overset{\text{SVD}}{=} \mat{U}_j\mat{D}_j\t{\mat{U}_j}
    = \mat{\alpha}_j(\ten{F}\times_{i\in[r]\backslash j}\mat{\alpha}_i)_{(j)}
        \t{(\ten{F}\times_{i\in[r]\backslash j}\mat{\alpha}_i)_{(j)}}\t{\mat{\alpha}_j}
    = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}
        \t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}\t{\mat{\alpha}_j}
\end{align*}
for the modes $j = 1, ..., r$. Using this relation we construct an iterative estimation process by setting the initial estimates to $\hat{\mat{\alpha}}_j^{(0)} = \mat{U}_j[, 1:q_j]$, the first $q_j$ columns of $\mat{U}_j$.
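This initialization is what the package's `kpir.ls` (added as a new file further down) computes; condensed, with mode 1 as the sample mode:

# first q_j left singular vectors of X_(j) X_(j)' for every non-sample mode j
modes <- seq_along(dim(X))[-1L]
alphas0 <- Map(function(mode, ncol) {
  La.svd(mcrossprod(X, mode = mode), ncol)$u   # = U_j[, 1:q_j]
}, modes, dim(Fy)[modes])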

\todo{continue}
To obtain least squares estimates for $\mat{\alpha}_j$, $j = 1, ..., r$, we observe that matricization of the mean model
\begin{displaymath}
    \ten{X}_{(j)} = (\ten{F}\times_{k\in[r]}\mat{\alpha}_k)_{(j)} = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}
\end{displaymath}
leads to normal equations for each $\mat{\alpha}_j$, $j = 1, ..., r$:
\begin{displaymath}
    \ten{X}_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}
\end{displaymath}
where the normal equations for $\mat{\alpha}_j$ depend on all the other $\mat{\alpha}_k$. With the initial estimates from above this allows an alternating approach. Let $t = 1, 2, ...$ index the current iteration; a new estimate $\widehat{\mat{\alpha}}_j^{(t)}$ given the previous estimates $\widehat{\mat{\alpha}}_k^{(t-1)}$, $k = 1, ..., r$, is computed as
\begin{displaymath}
    \widehat{\mat{\alpha}}_j^{(t)} =
    \ten{X}_{(j)}
    \t{\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}}
    \left(
        \big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}
        \t{\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}}
    \right)^{-1}
\end{displaymath}
for $j = 1, ..., r$ until convergence or a maximum number of iterations is exceeded. The final estimates are the least squares estimates produced by this procedure.
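In code, each update is a single `solve` against the normal equations. The following is the core loop of the package's `kpir.ls` as shown in the diff below (`mat`, `mlm` and `mcrossprod` are package exports):

for (j in seq_along(modes)) {
  # Z = F x_{k != j} alpha_k, then alpha_j = X_(j) Z_(j)' (Z_(j) Z_(j)')^-1
  Z <- mlm(Fy, alphas[-j], modes = modes[-j])
  alphas[[j]] <- t(solve(mcrossprod(Z, mode = modes[j]),
                         tcrossprod(mat(Z, modes[j]), mat(X, modes[j]))))
}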


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%                          Numerical Examples                               %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Numerical Examples}
The first example (which by itself is \emph{not} exemplary) is the estimation with parameters $n = 200$, $p = 11$, $q = 5$, $k = 14$ and $r = 9$. The ``true'' matrices $\mat\alpha$, $\mat\beta$ are generated by sampling their elements i.i.d. standard normal, as are the responses $y$. Then, for each observation, $\mat{f}_y$ is computed as $\sin(s_{i, j} y + o_{i j})$ \todo{properly describe} to fill the elements of $\mat{f}_y$. Then the $\mat{X}$'s are sampled as
\begin{displaymath}
    \mat{X} = \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}, \qquad \vec{\mat{\epsilon}} \sim \mathcal{N}_{p q}(\mat{0}, \mat{\Delta})
\end{displaymath}
where $\mat{\Delta}_{i j} = 0.5^{|i - j|}$ for $i, j = 1, ..., p q$.
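In code, this data generation reads as in the simulation script further down (construction of $\mat{f}_y$ omitted); `CVarE:::rmvnorm` is the multivariate normal sampler used there:

Delta <- 0.5^abs(outer(seq_len(p * q), seq_len(p * q), `-`))   # AR(0.5)
X <- tcrossprod(Fy, kronecker(alpha, beta)) + CVarE:::rmvnorm(n, sigma = Delta)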
% The first example (which by it self is \emph{not} exemplary) is the estimation with parameters $n = 200$, $p = 11$, $q = 5$, $k = 14$ and $r = 9$. The ``true'' matrices $\mat\alpha$, $\mat\beta$ generated by sampling there elements i.i.d. standard normal like the responses $y$. Then, for each observation, $\mat{f}_y$ is computed as $\sin(s_{i, j} y + o_{i j})$ \todo{ properly describe} to fill the elements of $\mat{f}_y$. Then the $\mat{X}$'s are samples as
% \begin{displaymath}
%     \mat{X} = \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}, \qquad \vec{\mat{\epsilon}} \sim \mathbb{N}_{p q}(\mat{0}, \mat{\Delta})
% \end{displaymath}
% where $\mat{\Delta}_{i j} = 0.5^{|i - j|}$ for $i, j = 1, ..., p q$.

\begin{figure}
\begin{table}[!ht]
    \centering
    \includegraphics{loss_Ex01.png}
\end{figure}
\begin{figure}
    \centering
    \includegraphics{estimates_Ex01.png}
\end{figure}
\begin{figure}
    \centering
    \includegraphics{Delta_Ex01.png}
\end{figure}
\begin{figure}
    \centering
    \includegraphics{hist_Ex01.png}
\end{figure}
    % see: https://en.wikibooks.org/wiki/LaTeX/Tables
    \begin{tabular}{ll|r@{ }l *{3}{r@{.}l}}
        method & init
        & \multicolumn{2}{c}{loss}
        & \multicolumn{2}{c}{MSE}
        & \multicolumn{2}{c}{$\dist(\hat{\mat\alpha}, \mat\alpha)$}
        & \multicolumn{2}{c}{$\dist(\hat{\mat\beta}, \mat\beta)$}
        \\ \hline
        base     & vlp & -2642&(1594) & 1&82 (2.714)  & 0&248 (0.447) & 0&271 (0.458) \\
        new      & vlp & -2704&(1452) & 1&78 (2.658)  & 0&233 (0.438) & 0&260 (0.448) \\
        new      & ls  & -3479& (95)  & 0&99 (0.025)  & 0&037 (0.017) & 0&035 (0.015) \\
        momentum & vlp & -2704&(1452) & 1&78 (2.658)  & 0&233 (0.438) & 0&260 (0.448) \\
        momentum & ls  & -3479& (95)  & 0&99 (0.025)  & 0&037 (0.017) & 0&035 (0.015) \\
        approx   & vlp &  6819&(1995) & 3&99 (12.256) & 0&267 (0.448) & 0&287 (0.457) \\
        approx   & ls  &  5457& (163) & 0&99 (0.025)  & 0&033 (0.017) & 0&030 (0.012) \\
    \end{tabular}
    \caption{Mean (standard deviation) over $20$ simulated repetitions for the model $\mat{X} = \mat{\beta}\mat{f}_y\t{\mat{\alpha}}$ of dimensions $(p, q) = (11, 7)$, $(k, r) = (3, 5)$ with a sample size of $n = 200$. The covariance structure is $\mat{\Delta} = \mat{\Delta}_2\otimes \mat{\Delta}_1$ for $\Delta_i = \text{AR}(\sqrt{0.5})$, $i = 1, 2$. The functions applied to the standard normal response $y$ are $\sin, \cos$ with increasing frequency.}
\end{table}

% \begin{figure}
%     \centering
%     \includegraphics{loss_Ex01.png}
% \end{figure}
% \begin{figure}
%     \centering
%     \includegraphics{estimates_Ex01.png}
% \end{figure}
% \begin{figure}
%     \centering
%     \includegraphics{Delta_Ex01.png}
% \end{figure}
% \begin{figure}
%     \centering
%     \includegraphics{hist_Ex01.png}
% \end{figure}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%                           Bib and Index                                   %%%
@@ -654,41 +753,33 @@ Therefore, we use $\mat{A}, \mat{B}, \mat{X}, \mat{F}, \mat{R}, ...$ for matrice
    \ten{T}\ttm[2]\mat{B}\ &{\widehat{=}}\ \mat{B}\ten{T}_{(2)} & \ten{T}(n, p, q), \ten{T}_{(2)}(p, n q), \mat{B}(q, p)
\end{align*}

\section{Matrix Valued Normal Distribution}
A random variable $\mat{X}$ of dimensions $p\times q$ is \emph{Matrix-Valued Normal Distribution}, denoted
\begin{displaymath}
    \mat{X}\sim\mathcal{MN}_{p\times q}(\mat{\mu}, \mat{\Delta}_2, \mat{\Delta}_1),
\end{displaymath}
if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otimes\mat\Delta_2)$. Note the order of the covariance matrices $\mat\Delta_1, \mat\Delta_2$. Its density is given by
\begin{displaymath}
    f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right).
\end{displaymath}
% \section{Matrix Valued Normal Distribution}
% A random variable $\mat{X}$ of dimensions $p\times q$ is \emph{Matrix-Valued Normal Distribution}, denoted
% \begin{displaymath}
%     \mat{X}\sim\mathcal{MN}_{p\times q}(\mat{\mu}, \mat{\Delta}_2, \mat{\Delta}_1),
% \end{displaymath}
% if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otimes\mat\Delta_2)$. Note the order of the covariance matrices $\mat\Delta_1, \mat\Delta_2$. Its density is given by
% \begin{displaymath}
%     f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right).
% \end{displaymath}

\section{Sampling form a Multi-Array Normal Distribution}
Let $\ten{X}$ be an order (rank) $r$ Multi-Array random variable of dimensions $p_1\times...\times p_r$ following a Multi-Array (or Tensor) Normal distributed
\begin{displaymath}
    \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
\end{displaymath}
Its density is given by
\begin{displaymath}
    f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1}
    \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
\end{displaymath}
with $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
\begin{displaymath}
    \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1)
\end{displaymath}
with $p = \prod_{i = 1}^r p_i$.
% \section{Sampling form a Multi-Array Normal Distribution}
% Let $\ten{X}$ be an order (rank) $r$ Multi-Array random variable of dimensions $p_1\times...\times p_r$ following a Multi-Array (or Tensor) Normal distributed
% \begin{displaymath}
%     \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
% \end{displaymath}
% Its density is given by
% \begin{displaymath}
%     f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1}
%     \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
% \end{displaymath}
% with $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
% \begin{displaymath}
%     \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1)
% \end{displaymath}
% with $p = \prod_{i = 1}^r p_i$.

When sampling from the Multi-Array Normal, one way is to sample from the Multi-Variate Normal and reshape the result. This is usually very inefficient because it requires storing the $p\times p$ multi-variate covariance matrix. It is more efficient to sample $\ten{Z}$ as a tensor of the same shape as $\ten{X}$ with standard normal entries and then transform $\ten{Z}$ to follow the Multi-Array Normal via
\begin{displaymath}
    \ten{Z}\sim\mathcal{TN}(0, \mat{I}_{p_1}, ..., \mat{I}_{p_r})
    \quad\Rightarrow\quad
    \ten{X} = \ten{Z}\times\{\mat{\Delta}_1^{1/2}, ..., \mat{\Delta}_r^{1/2}\} + \mu\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r)
\end{displaymath}
where sampling from the standard Multi-Array Normal is done by drawing every element of $\ten{Z}$ from a standard Normal.

\todo{Check this!!!}
% \todo{Check this!!!}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%                         Reference Summaries                               %%%

@@ -13,21 +13,32 @@ log.prog <- function(max.iter) {


### Exec all methods for a given data set and collect logs ###
sim <- function(X, Fy, shape, alpha.true, beta.true, max.iter = 500L) {
sim <- function(X, Fy, alpha.true, beta.true, max.iter = 500L) {

  # Logger creator
  logger <- function(name) {
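    # substitute() rebinds `hist` to the per-method data.frame (e.g.
    # `hist.base`) so that each logger closure writes into its own target
    # via `<<-` (see the `list(hist = as.symbol(...))` binding below)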
    eval(substitute(function(iter, loss, alpha, beta, ...) {
      hist[iter + 1L, ] <<- c(
        iter = iter,
        loss = loss,
        dist = (dist <- dist.subspace(c(kronecker(alpha.true, beta.true)),
                                      c(kronecker(alpha, beta)))),
        dist.alpha = (dist.alpha <- dist.subspace(c(alpha.true), c(alpha))),
        dist.beta = (dist.beta <- dist.subspace(c( beta.true), c(beta ))),
        norm.alpha = norm(alpha, "F"),
        norm.beta = norm(beta, "F")
      )
      tryCatch({
        hist[iter + 1L, ] <<- c(
          iter = iter,
          loss = loss,
          dist = (dist <- dist.subspace(c(kronecker(alpha.true, beta.true)),
                                        c(kronecker(alpha, beta)))),
          dist.alpha = (dist.alpha <- dist.subspace(c(alpha.true), c(alpha))),
          dist.beta = (dist.beta <- dist.subspace(c( beta.true), c(beta ))),
          norm.alpha = norm(alpha, "F"),
          norm.beta = norm(beta, "F"),
          mse = mean((X - mlm(Fy, alpha, beta, modes = 3:2))^2)
        )},
        error = function(e) {
          cat("Error in ", name,
              ", dim(alpha): ", dim(alpha),
              ", dim(alpha.true): ", dim(alpha.true),
              ", dim(beta)", dim(beta),
              ", dim(beta.true)", dim(beta.true),
              "\n")
          stop(e)
        })

      cat(sprintf(
        "%s(%3d) | l = %-12.4f - dist = %-.4e - alpha(%d, %d) = %-.4e - beta(%d, %d) = %-.4e\n",
@@ -39,62 +50,94 @@ sim <- function(X, Fy, shape, alpha.true, beta.true, max.iter = 500L) {
  }

  # Initialize logger history targets
  hist.base <- hist.new <- hist.momentum <- hist.approx <- # hist.kron <-
  hist.base <-
    hist.new.vlp <- hist.new.ls <-
    hist.ls <-
    hist.momentum.vlp <- hist.momentum.ls <-
    hist.approx.vlp <- hist.approx.ls <-
      data.frame(iter = seq(0L, max.iter),
                 loss = NA, dist = NA, dist.alpha = NA, dist.beta = NA,
                 norm.alpha = NA, norm.beta = NA
                 norm.alpha = NA, norm.beta = NA, mse = NA
      )

  # Base (old)
  kpir.base(X, Fy, shape, max.iter = max.iter, logger = logger("base"))
  kpir.base(X, Fy, max.iter = max.iter, logger = logger("base"))

  # New (simple Gradient Descent)
  kpir.new(X, Fy, shape, max.iter = max.iter, logger = logger("new"))
  # New (simple Gradient Descent, using VLP initialization)
  kpir.new(X, Fy, max.iter = max.iter, init.method = "vlp",
           logger = logger("new.vlp"))
  kpir.new(X, Fy, max.iter = max.iter, init.method = "ls",
           logger = logger("new.ls"))

  # Least Squares estimate (alternating estimation)
  kpir.ls(X, Fy, sample.mode = 1L, max.iter = max.iter, logger = logger("ls"))

  # Gradient Descent with Nesterov Momentum
  kpir.momentum(X, Fy, shape, max.iter = max.iter, logger = logger("momentum"))

  # # Residual Covariance Kronecker product assumption version
  # kpir.kron(X, Fy, shape, max.iter = max.iter, logger = logger("kron"))
  kpir.momentum(X, Fy, max.iter = max.iter, init.method = "vlp",
                logger = logger("momentum.vlp"))
  kpir.momentum(X, Fy, max.iter = max.iter, init.method = "ls",
                logger = logger("momentum.ls"))

  # Approximated MLE with Nesterov Momentum
  kpir.approx(X, Fy, shape, max.iter = max.iter, logger = logger("approx"))
  kpir.approx(X, Fy, max.iter = max.iter, init.method = "vlp",
              logger = logger("approx.vlp"))
  kpir.approx(X, Fy, max.iter = max.iter, init.method = "ls",
              logger = logger("approx.ls"))

  # Add method tags
  hist.base$method <- factor("base")
  hist.new$method <- factor("new")
  hist.momentum$method <- factor("momentum")
  # hist.kron$method <- factor("kron")
  hist.approx$method <- factor("approx")
  hist.base$method <- factor("base")
  hist.new.vlp$method <- factor("new")
  hist.new.ls$method <- factor("new")
  hist.ls$method <- factor("ls")
  hist.momentum.vlp$method <- factor("momentum")
  hist.momentum.ls$method <- factor("momentum")
  hist.approx.vlp$method <- factor("approx")
  hist.approx.ls$method <- factor("approx")
  # Add init. method tag
  hist.base$init <- factor("vlp")
  hist.new.vlp$init <- factor("vlp")
  hist.new.ls$init <- factor("ls")
  hist.ls$init <- factor("ls")
  hist.momentum.vlp$init <- factor("vlp")
  hist.momentum.ls$init <- factor("ls")
  hist.approx.vlp$init <- factor("vlp")
  hist.approx.ls$init <- factor("ls")

  # Combine results and return
  rbind(hist.base, hist.new, hist.momentum, hist.approx) #, hist.kron
  rbind(
    hist.base,
    hist.new.vlp, hist.new.ls,
    hist.ls,
    hist.momentum.vlp, hist.momentum.ls,
    hist.approx.vlp, hist.approx.ls
  )
}

## Plot helper function
plot.hist2 <- function(hist, response, type = "all", ...) {
  # Extract final results from history
  sub <- na.omit(hist[c("iter", response, "method", "repetition")])
  sub <- aggregate(sub, list(sub$method, sub$repetition), tail, 1)
  sub <- na.omit(hist[c("iter", response, "method", "init", "repetition")])
  sub <- aggregate(sub, list(sub$method, sub$init, sub$repetition), tail, 1)

  # Setup ggplot
  p <- ggplot(hist, aes_(x = quote(iter),
                         y = as.name(response),
                         color = quote(method),
                         group = quote(interaction(method, repetition))))
                         linetype = quote(init),
                         group = quote(interaction(method, repetition, init))))
  # Add requested layers
  if (type == "all") {
    p <- p + geom_line(na.rm = TRUE)
    p <- p + geom_point(data = sub)
  } else if (type == "mean") {
    p <- p + geom_line(alpha = 0.5, na.rm = TRUE, linetype = "dotted")
    p <- p + geom_point(data = sub, alpha = 0.5)
    p <- p + geom_line(aes(group = method),
    p <- p + geom_line(alpha = 0.4, na.rm = TRUE, linetype = "dotted")
    p <- p + geom_point(data = sub, alpha = 0.4)
    p <- p + geom_line(aes(group = interaction(method, init)),
                       stat = "summary", fun = "mean", na.rm = TRUE)
  } else if (type == "median") {
    p <- p + geom_line(alpha = 0.5, na.rm = TRUE, linetype = "dotted")
    p <- p + geom_point(data = sub, alpha = 0.5)
    p <- p + geom_line(aes(group = method),
    p <- p + geom_line(alpha = 0.4, na.rm = TRUE, linetype = "dotted")
    p <- p + geom_point(data = sub, alpha = 0.4)
    p <- p + geom_line(aes(group = interaction(method, init)),
                       stat = "summary", fun = "median", na.rm = TRUE)
  }
  # return with theme and annotations
@@ -107,10 +150,10 @@ plot.hist2 <- function(hist, response, type = "all", ...) {

## Generate some test data / DEBUG
n <- 200              # Sample Size
p <- sample(1:15, 1)  # 11
q <- sample(1:15, 1)  # 3
k <- sample(1:15, 1)  # 7
r <- sample(1:15, 1)  # 5
p <- sample(2:15, 1)  # 11
q <- sample(2:15, 1)  # 7
k <- min(sample(1:15, 1), p - 1)  # 3
r <- min(sample(1:15, 1), q - 1)  # 5
print(c(n, p, q, k, r))

hist <- NULL
@@ -129,8 +172,10 @@ for (rep in 1:reps) {
  ))
  Delta <- 0.5^abs(outer(seq_len(p * q), seq_len(p * q), `-`))
  X <- tcrossprod(Fy, kronecker(alpha, beta)) + CVarE:::rmvnorm(n, sigma = Delta)
  dim(X) <- c(n, p, q)
  dim(Fy) <- c(n, k, r)

  hist.sim <- sim(X, Fy, shape = c(p, q, k, r), alpha.true, beta.true)
  hist.sim <- sim(X, Fy, alpha.true, beta.true)
  hist.sim$repetition <- rep

  hist <- rbind(hist, hist.sim)
@@ -167,22 +212,22 @@ for (response in c("loss", "dist", "dist.alpha", "dist.beta")) {

n <- 200  # Sample Size
p <- 11   # sample(1:15, 1)
q <- 3    # sample(1:15, 1)
k <- 7    # sample(1:15, 1)
q <- 7    # sample(1:15, 1)
k <- 3    # sample(1:15, 1)
r <- 5    # sample(1:15, 1)
print(c(n, p, q, k, r))

hist <- NULL
reps <- 20
max.iter <- 2

Delta.1 <- sqrt(0.5)^abs(outer(seq_len(q), seq_len(q), `-`))
Delta.2 <- sqrt(0.5)^abs(outer(seq_len(p), seq_len(p), `-`))
Delta <- kronecker(Delta.1, Delta.2)
Delta.1 <- sqrt(0.5)^abs(outer(seq_len(p), seq_len(p), `-`))
Delta.2 <- sqrt(0.5)^abs(outer(seq_len(q), seq_len(q), `-`))
for (rep in 1:reps) {
  cat(sprintf("%4d / %d simulation rep. started\n", rep, reps))
  cat(sprintf("\n\033[1m%4d / %d simulation rep. started\033[0m\n", rep, reps))

  alpha.true <- alpha <- matrix(rnorm(q * r), q, r)
  beta.true <- beta <- matrix(rnorm(p * k), p, k)
  alpha.1.true <- alpha.1 <- matrix(rnorm(q * r), q, r)
  alpha.2.true <- alpha.2 <- matrix(rnorm(p * k), p, k)
  y <- rnorm(n)
  Fy <- do.call(cbind, Map(function(slope, offset) {
    sin(slope * y + offset)

@@ -190,15 +235,16 @@ for (rep in 1:reps) {
    head(rep(seq(1, ceiling(0.5 * k * r)), each = 2), k * r),
    head(rep(c(0, pi / 2), ceiling(0.5 * k * r)), k * r)
  ))
  X <- tcrossprod(Fy, kronecker(alpha, beta)) + CVarE:::rmvnorm(n, sigma = Delta)
  dim(Fy) <- c(n, k, r)
  X <- mlm(Fy, alpha.1, alpha.2, modes = 3:2)
  X <- X + rtensornorm(n, 0, Delta.1, Delta.2, sample.mode = 1L)

  hist.sim <- sim(X, Fy, shape = c(p, q, k, r), alpha.true, beta.true)
  hist.sim <- sim(X, Fy, alpha.1.true, alpha.2.true, max.iter = max.iter)
  hist.sim$repetition <- rep

  hist <- rbind(hist, hist.sim)
}


# Save simulation results
sim.name <- "sim02"
datetime <- format(Sys.time(), "%Y%m%dT%H%M")
@@ -207,14 +253,107 @@ saveRDS(hist, file = sprintf("%s_%s.rds", sim.name, datetime))
# for GGPlot2, as factors for grouping
hist$repetition <- factor(hist$repetition)

for (response in c("loss", "dist", "dist.alpha", "dist.beta")) {
for (response in c("loss", "mse", "dist", "dist.alpha", "dist.beta")) {
  for (fun in c("all", "mean", "median")) {
    print(plot.hist2(hist, response, fun, title = fun) + coord_trans(x = "log1p"))
    title <- paste(fun, paste(c("n", "p", "q", "k", "r"), c(n, p, q, k, r), sep = "=", collapse = ", "))
    print(plot.hist2(hist, response, fun, title = title) +
      coord_trans(x = "log1p"))
    dev.print(png, file = sprintf("%s_%s_%s_%s.png", sim.name, datetime, response, fun),
              width = 768, height = 768, res = 125)
    if (response != "loss") {
      print(plot.hist2(hist, response, fun, title = title) +
        coord_trans(x = "log1p", y = "log1p"))
      dev.print(png, file = sprintf("%s_%s_%s_%s_log.png", sim.name, datetime, response, fun),
                width = 768, height = 768, res = 125)
    }
  }
}

stats <- local({
  # final result from history
  sub <- na.omit(hist)
  sub <- aggregate(sub, list(
    method = sub$method, init = sub$init, repetition = sub$repetition
  ), tail, 1)

  # aggregate statistics over repetitions
  stats.mean <- aggregate(subset(sub, select = c("loss", "mse", "dist.alpha", "dist.beta")),
                          list(method = sub$method, init = sub$init), mean)
  stats.sd <- aggregate(subset(sub, select = c("loss", "mse", "dist.alpha", "dist.beta")),
                        list(method = sub$method, init = sub$init), sd)

  # merge mean and sd stats together
  merge(stats.mean, stats.sd, by = c("method", "init"), suffixes = c(".mean", ".sd"))
})
print(stats, digits = 2)
#     method init loss.mean mse.mean dist.alpha.mean dist.beta.mean loss.sd mse.sd dist.alpha.sd dist.beta.sd
# 1   approx   ls      5457     0.99           0.033          0.030     163  0.025         0.017        0.012
# 2   approx  vlp      6819     3.99           0.267          0.287    1995 12.256         0.448        0.457
# 3     base  vlp     -2642     1.82           0.248          0.271    1594  2.714         0.447        0.458
# 4 momentum   ls     -3479     0.99           0.037          0.035      95  0.025         0.017        0.015
# 5 momentum  vlp     -2704     1.78           0.233          0.260    1452  2.658         0.438        0.448
# 6      new   ls     -3479     0.99           0.037          0.035      95  0.025         0.017        0.015
# 7      new  vlp     -2704     1.78           0.233          0.260    1452  2.658         0.438        0.448

################################################################################
###                                 Sim 3                                    ###
################################################################################
n <- 200
p <- c(7, 11, 5)  # response dimensions (order 3)
q <- c(3, 6, 2)   # predictor dimensions (order 3)

# currently only kpir.ls supports higher orders (order > 2)
sim3 <- function(X, Fy, alphas.true, max.iter = 500L) {

  # Logger creator
  logger <- function(name) {
    eval(substitute(function(iter, loss, alpha, beta, ...) {
      hist[iter + 1L, ] <<- c(
        iter = iter,
        loss = loss,
        mse = (mse <- mean((X - mlm(Fy, alpha, beta, modes = 3:2))^2)),
        (dist <- unlist(Map(dist.subspace, alphas, alphas.true)))
      )

      cat(sprintf(
        "%s(%3d) | loss: %-12.4f - mse: %-12.4f - sum(dist): %-.4e\n",
        name, iter, loss, mse, sum(dist)
      ))
    }, list(hist = as.symbol(paste0("hist.", name)))))
  }

  # Initialize logger history targets
  hist.ls <-
    do.call(data.frame, c(list(
      iter = seq(0, r), loss = NA, mse = NA),
      dist = rep(NA, length(dim(X)) - 1L)
    ))

  # Least Squares estimate (alternating estimation)
  kpir.ls(X, Fy, sample.mode = 1L, max.iter = max.iter, logger = logger("ls"))

  # Add method tags
  hist.ls$method <- factor("ls")

  # # Combine results and return
  # rbind(hist.base, hist.new, hist.momentum, hist.approx, hist.ls)
  hist.ls
}

sample.data3 <- function(n, p, q) {
  stopifnot(length(p) == length(q))
  stopifnot(all(q <= p))

  Deltas <- Map(function(nrow) {

  }, p)

  list(X, Fy, alphas, Deltas)
}



################################################################################
###                                  WIP                                     ###
################################################################################

@@ -1,3 +1,26 @@
################################################################################
###                        Sparse SIR against SIR                            ###
################################################################################
devtools::load_all('tensorPredictors/')
library(dr)

n <- 100
p <- 10

X <- rmvnorm(n, sigma = 0.5^abs(outer(1:p, 1:p, `-`)))
y <- rowSums(X[, 1:3]) + rnorm(n, 0.5)
B <- as.matrix(1:p <= 3) / sqrt(3)

dr.sir <- dr(y ~ X, method = 'sir', numdir = ncol(B))
B.sir <- dr.sir$evectors[, seq_len(ncol(B)), drop = FALSE]

dist.projection(B, B.sir)

B.poi <- with(GEP(X, y, 'sir'), {
  POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = TRUE)$vectors
})

dist.projection(B, B.poi)

################################################################################
###               LDA (sparse Linear Discriminant Analysis)                  ###
################################################################################
@@ -69,19 +92,19 @@ dataset <- function(nr) {
  list(X = X, y = y, B = qr.Q(qr(B)))
}

# # Model 1
# fit <- with(dataset(1), {
#   with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C'))
# })
# fit <- with(dataset(1), {
#   with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C'))
# })
# fit <- with(dataset(1), {
#   with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = TRUE))
# })
# fit <- with(dataset(1), {
#   with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C', use.C = TRUE))
# })
# Model 1
fit <- with(dataset(1), {
  with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C'))
})
fit <- with(dataset(1), {
  with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C'))
})
fit <- with(dataset(1), {
  with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = TRUE))
})
fit <- with(dataset(1), {
  with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C', use.C = TRUE))
})

# head(fit$vectors, 10)

@@ -90,19 +113,24 @@ nr.reps <- 100
sim <- replicate(nr.reps, {
  res <- double(0)
  for (model.nr in 1:5) {
    for (method in c('POI-C', 'FastPOI-C')) {
      for (use.C in c(FALSE, TRUE)) {
        dist <- with(dataset(model.nr), {
    with(dataset(model.nr), {
      for (method in c('POI-C', 'FastPOI-C')) {
        for (use.C in c(FALSE, TRUE)) {
          fit <- with(GEP(X, y, 'lda'), {
            POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = use.C)
          })
          # dist.subspace(B, fit$vectors, is.ortho = FALSE, normalize = TRUE)
          dist.projection(B, fit$vectors)
        })
        names(dist) <- paste('M', model.nr, '-', method, '-', use.C)
        res <- c(res, dist)
          dist <- dist.projection(B, fit$vectors)
          names(dist) <- paste('M', model.nr, '-', method, '-', use.C)
          res <<- c(res, dist)
        }
      }
      }
      fit <- with(GEP(X, y, 'lda'), {
        solve.gep(lhs, rhs, ncol(B))
      })
      dist <- dist.projection(B, fit$vectors)
      names(dist) <- paste('M', model.nr, '- solve -', use.C)
      res <<- c(res, dist)
    })
  }
  cat("Counter", (count <<- count + 1), "/", nr.reps, "\n")
  res

@@ -15,15 +15,17 @@ export(dist.projection)
export(dist.subspace)
export(kpir.approx)
export(kpir.base)
export(kpir.kron)
export(kpir.ls)
export(kpir.momentum)
export(kpir.new)
export(mat)
export(matpow)
export(matrixImage)
export(mcrossprod)
export(mlm)
export(reduce)
export(rowKronecker)
export(rtensornorm)
export(tensor_predictor)
export(ttm)
import(stats)

@@ -25,7 +25,7 @@
#' dataset <- function(name, n = 60, p = 24) {
#'     name <- toupper(name)
#'     if (!startsWith('M', name)) { name <- paste0('M', name) }
#'
#'
#'     if (name %in% c('M1', 'M2', 'M3')) {
#'         Sigma <- 0.5^abs(outer(1:p, 1:p, `-`))
#'         X <- rmvnorm(n, sigma = Sigma)

@@ -52,7 +52,7 @@
#'     } else {
#'         stop('Unknown dataset name.')
#'     }
#'
#'
#'     list(X = X, y = y, B = B)
#' }
#' # Sample dataset

@@ -63,16 +63,16 @@
#' P.Fy <- Fy %*% solve(crossprod(Fy), t(Fy))
#' M <- with(ds, crossprod(X, P.Fy %*% X) / nrow(X)) # Sigma Fit
#' N <- cov(ds$X) # Sigma
#'
#'
#' fits <- CISE(M, N, d = ncol(ds$B), Theta = log(seq(1, exp(1e-3), len = 1000)))
#'
#'
#' BIC <- unlist(Map(attr, fits, 'BIC'))
#' df <- unlist(Map(attr, fits, 'df'))
#' dist <- unlist(Map(attr, fits, 'dist'))
#' iter <- unlist(Map(attr, fits, 'iter'))
#' theta <- unlist(Map(attr, fits, 'theta'))
#' p.theta <- unlist(Map(function(V) sum(rowSums(V^2) > 1e-9), fits))
#'
#'
#' par(mfrow = c(2, 2))
#' plot(theta, BIC, type = 'l')
#' plot(theta, p.theta, type = 'l')

@@ -84,7 +84,9 @@
#'  Reduction and Variable Selection" By Xin Chen, Changliang Zou and
#'  R. Dennis Cook.
#'
#' @suggest RSpectra
#' @note for speed reasons this function attempts to use
#'  \code{\link[RSpectra]{eigs_sym}} if \pkg{\link{RSpectra}} is installed,
#'  otherwise \code{\link{eigen}} is used which might be significantly slower.
#'
#' @export
CISE <- function(M, N, d = 1L, method = "PFC", max.iter = 100L, Theta = NULL,

@@ -40,7 +40,7 @@ dist.subspace <- function (A, B, is.ortho = FALSE, normalize = FALSE,

  if (normalize) {
    rankSum <- ncol(A) + ncol(B)
    c <- 1 / sqrt(min(rankSum, 2 * nrow(A) - rankSum))
    c <- 1 / sqrt(max(1, min(rankSum, 2 * nrow(A) - rankSum)))
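    # max(1, .) guards against a division by zero in the edge case
    # rankSum == 2 * nrow(A)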
  } else {
    c <- sqrt(2)
  }

@@ -9,6 +9,7 @@
kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
  max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
  nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
  max.init.iter = 20L, init.method = c("ls", "vlp"),
  eps = .Machine$double.eps,
  logger = NULL
) {

@@ -17,7 +18,7 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
  stopifnot(nrow(X) == NROW(Fy))
  n <- nrow(X) # Number of observations

  # Get and check predictor dimensions
  # Check predictor dimensions
  if (length(dim(X)) == 2L) {
    stopifnot(!missing(shape))
    stopifnot(ncol(X) == prod(shape[1:2]))

@@ -26,18 +27,16 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
  } else if (length(dim(X)) == 3L) {
    p <- dim(X)[2]
    q <- dim(X)[3]
    dim(X) <- c(n, p * q)
  } else {
    stop("'X' must be a matrix or 3-tensor")
  }

  # Get and check response dimensions
  # Check response dimensions
  if (!is.array(Fy)) {
    Fy <- as.array(Fy)
  }
  if (length(dim(Fy)) == 1L) {
    k <- r <- 1L
    dim(Fy) <- c(n, 1L)
  } else if (length(dim(Fy)) == 2L) {
    stopifnot(!missing(shape))
    stopifnot(ncol(Fy) == prod(shape[3:4]))

@@ -46,34 +45,44 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
  } else if (length(dim(Fy)) == 3L) {
    k <- dim(Fy)[2]
    r <- dim(Fy)[3]
    dim(Fy) <- c(n, k * r)
  } else {
    stop("'Fy' must be a vector, matrix or 3-tensor")
  }


  ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
  # Vectorize
  dim(Fy) <- c(n, k * r)
  dim(X) <- c(n, p * q)
  # Solve
  cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
  if (n <= k * r || qr(cpFy)$rank < k * r) {
    # In case of under-determined system replace the inverse in the normal
    # equation by the Moore-Penrose Pseudo Inverse
    B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
  } else {
    # Compute OLS estimate by the Normal Equation
    B <- t(solve(cpFy, crossprod(Fy, X)))
  ### Step 1: (Approx) Least Squares initial estimate
  init.method <- match.arg(init.method)
  if (init.method == "ls") {
    # De-vectorize (from now on tensor arithmetic)
    dim(Fy) <- c(n, k, r)
    dim(X) <- c(n, p, q)

    ls <- kpir.ls(X, Fy, max.iter = max.init.iter, sample.mode = 1L, eps = eps)
    c(beta0, alpha0) %<-% ls$alphas
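    # (`%<-%` destructures ls$alphas: beta0 gets the mode 2, alpha0 the
    # mode 3 estimate)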
  } else { # Van Loan and Pitsianis
    # Vectorize
    dim(Fy) <- c(n, k * r)
    dim(X) <- c(n, p * q)

    # solution for `X = Fy B' + epsilon`
    cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
    if (n <= k * r || qr(cpFy)$rank < k * r) {
      # In case of under-determined system replace the inverse in the normal
      # equation by the Moore-Penrose Pseudo Inverse
      B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
    } else {
      # Compute OLS estimate by the Normal Equation
      B <- t(solve(cpFy, crossprod(Fy, X)))
    }

    # De-vectorize (from now on tensor arithmetic)
    dim(Fy) <- c(n, k, r)
    dim(X) <- c(n, p, q)

    # Decompose `B = alpha x beta` into `alpha` and `beta`
    c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
  }

  # De-vectorize (from now on tensor arithmetic)
  dim(Fy) <- c(n, k, r)
  dim(X) <- c(n, p, q)

  # Decompose `B = alpha x beta` into `alpha` and `beta`
  c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))

  # Compute residuals
  R <- X - (Fy %x_3% alpha0 %x_2% beta0)

@@ -1,216 +0,0 @@
#' Gradient Descent Bases Tensor Predictors method with Nesterov Accelerated
#' Momentum and Kronecker structure assumption for the residual covariance
#' `Delta = Delta.1 %x% Delta.2` (simple plugin version!)
#'
#' @export
kpir.kron <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
  max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
  nesterov.scaling = function(a, t) { 0.5 * (1 + sqrt(1 + (2 * a)^2)) },
  eps = .Machine$double.eps,
  logger = NULL
) {

  # Check if X and Fy have same number of observations
  stopifnot(nrow(X) == NROW(Fy))
  n <- nrow(X) # Number of observations

  # Get and check predictor dimensions (convert to 3-tensor if needed)
  if (length(dim(X)) == 2L) {
    stopifnot(!missing(shape))
    stopifnot(ncol(X) == prod(shape[1:2]))
    p <- as.integer(shape[1]) # Predictor "height"
    q <- as.integer(shape[2]) # Predictor "width"
  } else if (length(dim(X)) == 3L) {
    p <- dim(X)[2]
    q <- dim(X)[3]
  } else {
    stop("'X' must be a matrix or 3-tensor")
  }

  # Get and check response dimensions (and convert to 3-tensor if needed)
  if (!is.array(Fy)) {
    Fy <- as.array(Fy)
  }
  if (length(dim(Fy)) == 1L) {
    k <- r <- 1L
    dim(Fy) <- c(n, 1L, 1L)
  } else if (length(dim(Fy)) == 2L) {
    stopifnot(!missing(shape))
    stopifnot(ncol(Fy) == prod(shape[3:4]))
    k <- as.integer(shape[3]) # Response functional "height"
    r <- as.integer(shape[4]) # Response functional "width"
  } else if (length(dim(Fy)) == 3L) {
    k <- dim(Fy)[2]
    r <- dim(Fy)[3]
  } else {
    stop("'Fy' must be a vector, matrix or 3-tensor")
  }


  ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
  # Vectorize
  dim(Fy) <- c(n, k * r)
  dim(X) <- c(n, p * q)
  # Solve
  cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
  if (n <= k * r || qr(cpFy)$rank < k * r) {
    # In case of under-determined system replace the inverse in the normal
    # equation by the Moore-Penrose Pseudo Inverse
    B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
  } else {
    # Compute OLS estimate by the Normal Equation
    B <- t(solve(cpFy, crossprod(Fy, X)))
  }

  # De-Vectroize (from now on tensor arithmetics)
  dim(Fy) <- c(n, k, r)
  dim(X) <- c(n, p, q)

  # Decompose `B = alpha x beta` into `alpha` and `beta`
  c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))

  # Compute residuals
  resid <- X - (Fy %x_3% alpha0 %x_2% beta0)

  # Covariance estimate
  Delta.1 <- tcrossprod(mat(resid, 3))
  Delta.2 <- tcrossprod(mat(resid, 2))
  tr <- sum(diag(Delta.1))
  Delta.1 <- Delta.1 / sqrt(n * tr)
  Delta.2 <- Delta.2 / sqrt(n * tr)

  # Transformed Residuals
  resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)

  # Evaluate negative log-likelihood
  loss <- 0.5 * (n * (p * log(det(Delta.1)) + q * log(det(Delta.2))) +
                 sum(resid.trans * resid))

  # Call history callback (logger) before the first iterate
  if (is.function(logger)) {
    logger(0L, loss, alpha0, beta0, Delta.1, Delta.2, NA)
  }


  ### Step 2: MLE with LS solution as starting value
  a0 <- 0
  a1 <- 1
  alpha1 <- alpha0
  beta1 <- beta0

  # main descent loop
  no.nesterov <- TRUE
  break.reason <- NA
  for (iter in seq_len(max.iter)) {
    if (no.nesterov) {
      # without extrapolation as fallback
      S.alpha <- alpha1
      S.beta <- beta1
    } else {
      # extrapolation using previous direction
      S.alpha <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0)
      S.beta <- beta1 + ((a0 - 1) / a1) * ( beta1 - beta0)
    }

    # Extrapolated Residuals
    resid <- X - (Fy %x_3% S.alpha %x_2% S.beta)

    # Covariance Estimates
    Delta.1 <- tcrossprod(mat(resid, 3))
    Delta.2 <- tcrossprod(mat(resid, 2))
    tr <- sum(diag(Delta.1))
    Delta.1 <- Delta.1 / sqrt(n * tr)
    Delta.2 <- Delta.2 / sqrt(n * tr)

    # Transform Residuals
    resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)

    # Calculate Gradients
    grad.alpha <- tcrossprod(mat(resid.trans, 3), mat(Fy %x_2% S.beta, 3))
    grad.beta <- tcrossprod(mat(resid.trans, 2), mat(Fy %x_3% S.alpha, 2))

    # Backtracking line search (Armijo type)
    # The `inner.prod` is used in the Armijo break condition but does not
    # depend on the step size.
    inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)

    # backtracking loop
    for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) {
      # Update `alpha` and `beta` (note: add(+), the gradients are already
      # pointing into the negative slope direction of the loss cause they are
      # the gradients of the log-likelihood [NOT the negative log-likelihood])
      alpha.temp <- S.alpha + delta * grad.alpha
      beta.temp <- S.beta + delta * grad.beta

      # Update Residuals, Covariance and transformed Residuals
      resid <- X - (Fy %x_3% alpha.temp %x_2% beta.temp)
      Delta.1 <- tcrossprod(mat(resid, 3))
      Delta.2 <- tcrossprod(mat(resid, 2))
      tr <- sum(diag(Delta.1))
      Delta.1 <- Delta.1 / sqrt(n * tr)
      Delta.2 <- Delta.2 / sqrt(n * tr)
      resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)

      # Evaluate negative log-likelihood
      loss.temp <- 0.5 * (n * (p * log(det(Delta.1)) + q * log(det(Delta.2)))
                          + sum(resid.trans * resid))

      # Armijo line search break condition
      if (loss.temp <= loss - 0.1 * delta * inner.prod) {
        break
      }
    }

    # Call logger (invoke history callback)
    if (is.function(logger)) {
      logger(iter, loss.temp, alpha.temp, beta.temp, Delta.1, Delta.2, delta)
    }

    # Ensure descent
    if (loss.temp < loss) {
      alpha0 <- alpha1
      alpha1 <- alpha.temp
      beta0 <- beta1
      beta1 <- beta.temp

      # check break conditions (in descent case)
      if (mean(abs(alpha1)) + mean(abs(beta1)) < eps) {
        break.reason <- "alpha, beta numerically zero"
        break # basically, estimates are zero -> stop
      }
      if (inner.prod < eps * (p * q + r * k)) {
        break.reason <- "mean squared gradient is smaller than epsilon"
        break # mean squared gradient is smaller than epsilon -> stop
      }
      if (abs(loss.temp - loss) < eps) {
        break.reason <- "decrease is too small (slow)"
        break # decrease is too small (slow) -> stop
      }

      loss <- loss.temp
      no.nesterov <- FALSE # always reset
    } else if (!no.nesterov) {
      no.nesterov <- TRUE # retry without momentum
      next
    } else {
      break.reason <- "failed even without momentum"
      break # failed even without momentum -> stop
    }

    # update momentum scaling
    a0 <- a1
    a1 <- nesterov.scaling(a1, iter)

    # Set next iter starting step.size to line searched step size
    # (while allowing it to encrease)
    step.size <- 1.618034 * delta

  }

  list(
    loss = loss,
    alpha = alpha1, beta = beta1,
    Delta.1 = Delta.1, Delta.2 = Delta.2,
    break.reason = break.reason
  )
}

@ -0,0 +1,77 @@
|
#' Per mode (axis) alternating least squares estimate
#'
#' @param X multi-dimensional array of predictors
#' @param Fy multi-dimensional array of responses (functions of the response)
#' @param max.iter maximum number of alternating sweeps over the modes
#' @param sample.mode index of the sample mode, a.k.a. observation axis index
#' @param eps numerical tolerance
#' @param logger optional callback, invoked before the first and after each
#'  iteration
#'
#' @export
kpir.ls <- function(X, Fy, max.iter = 20L, sample.mode = 1L,
  eps = .Machine$double.eps, logger = NULL
) {
  # In the scalar response case, add a new axis of size 1 such that `Fy`
  # has the same number of modes as `X`
  if (!is.array(Fy)) {
    dim(Fy) <- local({
      dims <- rep(1, length(dim(X)))
      dims[sample.mode] <- length(Fy)
      dims
    })
  } else {
    stopifnot(dim(X)[sample.mode] == dim(Fy)[sample.mode])
  }
  # Check that `X` and `Fy` have the same number of modes and observations
  stopifnot(length(dim(X)) == length(dim(Fy)))
  stopifnot(dim(X)[sample.mode] == dim(Fy)[sample.mode])
  # and model constraints
  stopifnot(all(dim(Fy) <= dim(X)))

  # mode index sequence (exclude sample mode, a.k.a. observation axis)
  modes <- seq_along(dim(X))[-sample.mode]


  ### Step 1: initial per mode estimates
  alphas <- Map(function(mode, ncol) {
    La.svd(mcrossprod(X, mode = mode), ncol)$u
  }, modes, dim(Fy)[modes])

  # # Scaling of alpha, such that `tr(alpha_i' alpha_i) = tr(alpha_j' alpha_j)`
  # # for `i, j = 1, ..., r`.
  # traces <- unlist(Map(function(alpha) sum(alpha^2), alphas))
  # alphas <- Map(`*`, prod(traces)^(1 / length(alphas)) / traces, alphas)

  # Call history callback (logger) before the first iteration
  if (is.function(logger)) { do.call(logger, c(0L, NA, rev(alphas))) }


  ### Step 2: iterate per mode (axis) least squares estimates
  for (iter in seq_len(max.iter)) {
    # cyclic iterate over modes
    for (j in seq_along(modes)) {
      # least squares solution for `alpha_j | alpha_i, i != j`
      Z <- mlm(Fy, alphas[-j], modes = modes[-j])
      alphas[[j]] <- t(solve(mcrossprod(Z, mode = modes[j]),
        tcrossprod(mat(Z, modes[j]), mat(X, modes[j]))))
      # TODO: alphas[[j]] <- t(solve(mcrossprod(Z, j), mcrossprod(Z, X, j)))
    }

    # # Scaling of alpha, such that `tr(alpha_i' alpha_i) = tr(alpha_j' alpha_j)`
    # # for `i, j = 1, ..., r`.
    # traces <- unlist(Map(function(alpha) sum(alpha^2), alphas))
    # alphas <- Map(`*`, prod(traces)^(1 / length(alphas)) / traces, alphas)

    # Call logger (invoke history callback)
    if (is.function(logger)) { do.call(logger, c(iter, NA, rev(alphas))) }

    # TODO: add some kind of break condition
  }

  ### Step 3: Moment estimates for `Delta_i`
  # Residuals
  R <- X - mlm(Fy, alphas, modes = modes)
  # Moment estimates for the `Delta_i`s
  Deltas <- Map(mcrossprod, list(R), mode = modes)
  Deltas <- Map(`*`, 1 / dim(X)[sample.mode], Deltas)

  list(
    alphas = structure(alphas, names = as.character(modes)),
    Deltas = structure(Deltas, names = as.character(modes))
  )
}
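
A minimal usage sketch for `kpir.ls`, assuming the package internals `mlm`, `mcrossprod` and `mat` used above are available. The data are simulated from the per mode model `X = Fy x { beta, alpha } + noise` with observations on mode 1; all dimensions are purely illustrative:

n <- 100
p <- c(5, 7) # predictor dimensions (modes 2 and 3)
k <- c(2, 3) # dimensions of the response functions (modes 2 and 3)
alphas.true <- Map(function(pj, kj) matrix(rnorm(pj * kj), pj, kj), p, k)
Fy <- array(rnorm(n * prod(k)), dim = c(n, k))
X <- mlm(Fy, alphas.true, modes = 2:3) +
  0.1 * array(rnorm(n * prod(p)), dim = c(n, p))
fit <- kpir.ls(X, Fy, sample.mode = 1L)
str(fit$alphas) # per mode estimates, named by mode ("2" and "3")
str(fit$Deltas) # per mode moment estimates of the covariances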
@ -5,6 +5,7 @@
kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
  max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
  nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
  max.init.iter = 20L, init.method = c("ls", "vlp"),
  eps = .Machine$double.eps,
  logger = NULL
) {

@ -48,19 +49,30 @@ kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
  }


  ### Step 1: (Approx) Least Squares initial estimate
  init.method <- match.arg(init.method)
  if (init.method == "ls") {
    dim(X) <- c(n, p, q)
    dim(Fy) <- c(n, k, r)
    ls <- kpir.ls(X, Fy, max.iter = max.init.iter, sample.mode = 1L, eps = eps)
    c(beta0, alpha0) %<-% ls$alphas
    dim(X) <- c(n, p * q)
    dim(Fy) <- c(n, k * r)
  } else { # Van Loan and Pitsianis
    # solution for `X = Fy B' + epsilon`
    cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
    if (n <= k * r || qr(cpFy)$rank < k * r) {
      # In case of an under-determined system, replace the inverse in the
      # normal equation by the Moore-Penrose pseudo inverse
      B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
    } else {
      # Compute the OLS estimate via the normal equation
      B <- t(solve(cpFy, crossprod(Fy, X)))
    }

    # Decompose `B = alpha x beta` into `alpha` and `beta`
    c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
  }

  # Compute residuals
  resid <- X - tcrossprod(Fy, kronecker(alpha0, beta0))
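
The "vlp" branch is the nearest Kronecker product decomposition of Van Loan and Pitsianis. A small sanity sketch, using `approx.kronecker` and the `%<-%` destructuring assignment exactly as in the code above (both package internals): for an exact Kronecker product the factors are recovered up to a scalar, so their product reproduces `B`.

p <- 5; q <- 3; k <- 4; r <- 2
alpha <- matrix(rnorm(q * r), q, r)
beta <- matrix(rnorm(p * k), p, k)
B <- kronecker(alpha, beta) # (p q) x (k r), as in the estimator
c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
# the factors are identified only up to scale, but their product is exact
# here since B has a rank-1 rearrangement
stopifnot(all.equal(kronecker(alpha0, beta0), B))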
@ -3,6 +3,7 @@
#' @export
kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
  max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
  max.init.iter = 20L, init.method = c("ls", "vlp"),
  eps = .Machine$double.eps,
  logger = NULL
) {

@ -45,19 +46,30 @@ kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
    stop("'Fy' must be a vector, matrix or 3-tensor")
  }

  ### Step 1: (Approx) Least Squares initial estimate
  init.method <- match.arg(init.method)
  if (init.method == "ls") {
    dim(X) <- c(n, p, q)
    dim(Fy) <- c(n, k, r)
    ls <- kpir.ls(X, Fy, max.iter = max.init.iter, sample.mode = 1L, eps = eps)
    c(beta, alpha) %<-% ls$alphas
    dim(X) <- c(n, p * q)
    dim(Fy) <- c(n, k * r)
  } else { # Van Loan and Pitsianis
    # solution for `X = Fy B' + epsilon`
    cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
    if (n <= k * r || qr(cpFy)$rank < k * r) {
      # In case of an under-determined system, replace the inverse in the
      # normal equation by the Moore-Penrose pseudo inverse
      B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
    } else {
      # Compute the OLS estimate via the normal equation
      B <- t(solve(cpFy, crossprod(Fy, X)))
    }

    # Decompose `B = alpha x beta` into `alpha` and `beta`
    c(alpha, beta) %<-% approx.kronecker(B, c(q, r), c(p, k))
  }

  # Compute residuals
  resid <- X - tcrossprod(Fy, kronecker(alpha, beta))
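
The same initialization switch applies to `kpir.new`; a hypothetical call sketch, with `X` and `Fy` shaped as the estimator expects:

fit.ls  <- kpir.new(X, Fy, init.method = "ls")  # ALS warm start via kpir.ls (the default)
fit.vlp <- kpir.new(X, Fy, init.method = "vlp") # former Van Loan-Pitsianis start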
@ -0,0 +1,71 @@
#' Multi Linear Multiplication
#'
#' C = A x { B1, ..., Br }
#'
#' @param A tensor (multi-linear array)
#' @param B matrix or list of matrices
#' @param ... further matrices, concatenated with \code{B}
#' @param modes integer sequence of the same length as the number of matrices
#'  supplied (in \code{B} and \code{...})
#'
#' @examples
#' # general usage
#' dimA <- c(3, 17, 19, 2)
#' dimC <- c(7, 11, 13, 5)
#' A <- array(rnorm(prod(dimA)), dim = dimA)
#' B <- Map(function(p, q) matrix(rnorm(p * q), p, q), dimC, dimA)
#' C1 <- mlm(A, B)
#' C2 <- mlm(A, B[[1]], B[[2]], B[[3]], B[[4]])
#' C3 <- mlm(A, B[[3]], B[[1]], B[[2]], B[[4]], modes = c(3, 1, 2, 4))
#' C4 <- mlm(A, B[1:3], B[[4]])
#' stopifnot(all.equal(C1, C2))
#' stopifnot(all.equal(C1, C3))
#' stopifnot(all.equal(C1, C4))
#'
#' # selected modes (supply one matrix per selected mode)
#' C1 <- mlm(A, B[2:3], modes = 2:3)
#' C2 <- mlm(A, B[[2]], B[[3]], modes = 2:3)
#' C3 <- ttm(ttm(A, B[[2]], 2), B[[3]], 3)
#' stopifnot(all.equal(C1, C2))
#' stopifnot(all.equal(C1, C3))
#'
#' # analog to matrix multiplication
#' A <- matrix(rnorm( 6), 2, 3)
#' B <- matrix(rnorm(12), 3, 4)
#' C <- matrix(rnorm(20), 5, 4)
#' stopifnot(all.equal(
#'   A %*% B %*% t(C),
#'   mlm(B, list(A, C))
#' ))
#'
#' # usage with repeated modes (non commutative)
#' dimA <- c(3, 17, 19, 2)
#' A <- array(rnorm(prod(dimA)), dim = dimA)
#' B1 <- matrix(rnorm(9), 3, 3)
#' B2 <- matrix(rnorm(9), 3, 3)
#' C <- matrix(rnorm(4), 2, 2)
#' # same modes do NOT commute
#' all.equal(
#'   mlm(A, B1, B2, C, modes = c(1, 1, 4)), # NOT equal!
#'   mlm(A, B2, B1, C, modes = c(1, 1, 4))
#' )
#' # but different modes do commute
#' P1 <- mlm(A, C, B1, B2, modes = c(4, 1, 1))
#' P2 <- mlm(A, B1, C, B2, modes = c(1, 4, 1))
#' P3 <- mlm(A, B1, B2, C, modes = c(1, 1, 4))
#' stopifnot(all.equal(P1, P2))
#' stopifnot(all.equal(P1, P3))
#'
#' @export
mlm <- function(A, B, ..., modes = seq_along(B)) {
  # Collect all matrices in `B`
  B <- c(if (is.matrix(B)) list(B) else B, list(...))

  # iteratively apply Tensor Times Matrix multiplication over the modes
  for (i in seq_along(modes)) {
    A <- ttm(A, B[[i]], modes[i])
  }

  # return the result tensor
  A
}
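
A closing sanity check (a standard identity, not part of the package API): on vectorizations, the multi linear multiplication agrees with a Kronecker product, `vec(A x { B1, B2 }) = (B2 %x% B1) vec(A)`. The sketch below only assumes `mlm` (and thus `ttm`) from above.

A  <- array(rnorm(12), dim = c(3, 4))
B1 <- matrix(rnorm(2 * 3), 2, 3)
B2 <- matrix(rnorm(5 * 4), 5, 4)
stopifnot(all.equal(
  c(mlm(A, list(B1, B2))),      # vec(A x { B1, B2 })
  c(kronecker(B2, B1) %*% c(A)) # (B2 %x% B1) vec(A)
))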