Compare commits

..

No commits in common. "a2963024ef6620412e7356a396dc724cc7c96b95" and "36dd08c7c97689d2eb8c5fcacc48c46dc976528c" have entirely different histories.

13 changed files with 413 additions and 658 deletions

18
.gitignore vendored
View File

@ -81,12 +81,8 @@ vignettes/*.pdf
*.RData *.RData
*.Rdata *.Rdata
# R Profiling
*.Rprof
# VSCode configuration # VSCode configuration
.vscode/ .vscode/
.lintr
## Archives, compressed files/folders ## Archives, compressed files/folders
# Output files from R CMD build # Output files from R CMD build
@ -99,17 +95,3 @@ wip/
# PDFs # PDFs
*.pdf *.pdf
# LaTeX (ignore everything except *.tex and *.bib files)
**/LaTeX/*
!**/LaTeX/*.tex
!**/LaTeX/*.bib
**/LaTeX/*-blx.bib
mlda_analysis/
References/
# Images (except images used in LaTeX)
*.png
*.svg
!**/LaTeX/*.png

View File

@ -49,7 +49,6 @@
\newcommand{\mat}[1]{\boldsymbol{#1}} \newcommand{\mat}[1]{\boldsymbol{#1}}
\newcommand{\ten}[1]{\mathcal{#1}} \newcommand{\ten}[1]{\mathcal{#1}}
\renewcommand{\vec}{\operatorname{vec}} \renewcommand{\vec}{\operatorname{vec}}
\newcommand{\dist}{\operatorname{dist}}
\DeclareMathOperator{\kron}{\otimes} % Kronecker Product \DeclareMathOperator{\kron}{\otimes} % Kronecker Product
\DeclareMathOperator{\hada}{\odot} % Hadamard Product \DeclareMathOperator{\hada}{\odot} % Hadamard Product
\newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix) \newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix)
@ -63,8 +62,8 @@
\DeclareMathOperator*{\argmax}{{arg\,max}} \DeclareMathOperator*{\argmax}{{arg\,max}}
\newcommand{\D}{\textnormal{D}} \newcommand{\D}{\textnormal{D}}
\renewcommand{\d}{\textnormal{d}} \renewcommand{\d}{\textnormal{d}}
\renewcommand{\t}[1]{{#1^{\prime}}} \renewcommand{\t}[1]{{{#1}'}}
\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse` \newcommand{\pinv}[1]{{{#1}^{\dagger}}} % `Moore-Penrose pseudoinverse`
\newcommand{\todo}[1]{{\color{red}TODO: #1}} \newcommand{\todo}[1]{{\color{red}TODO: #1}}
% \DeclareFontFamily{U}{mathx}{\hyphenchar\font45} % \DeclareFontFamily{U}{mathx}{\hyphenchar\font45}
@ -88,7 +87,7 @@ We start with a brief summary of the used notation.
\todo{write this} \todo{write this}
Let $\ten{A}$ be a multi-dimensional array of order (rank) $r$ with dimensions $p_1\times ... \times p_r$ and the matrices $\mat{B}_i$ of dimensions $q_i\times p_i$ for $i = 1, ..., r$, then Let $\ten{A}$ be a order (rank) $r$ tensor of dimensions $p_1\times ... \times p_r$ and the matrices $\mat{B}_i$ of dimensions $q_i\times p_i$ for $i = 1, ..., r$, then
\begin{displaymath} \begin{displaymath}
\ten{A} \ttm[1] \mat{B}_1 \ttm[2] \ldots \ttm[r] \mat{B}_r \ten{A} \ttm[1] \mat{B}_1 \ttm[2] \ldots \ttm[r] \mat{B}_r
= \ten{A}\times\{ \mat{B}_1, ..., \mat{B}_r \} = \ten{A}\times\{ \mat{B}_1, ..., \mat{B}_r \}
@ -111,65 +110,6 @@ Another example
\todo{continue} \todo{continue}
\section{Tensor Normal Distribution}
Let $\ten{X}$ be a multi-dimensional array random variable of order (rank) $r$ with dimensions $p_1\times ... \times p_r$ written as
\begin{displaymath}
\ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
\end{displaymath}
Its density is given by
\begin{displaymath}
f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{p_{-i}}} \Big)^{-1}
\exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
\end{displaymath}
where $p_{\lnot i} = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
\begin{displaymath}
\vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1)
\end{displaymath}
with $p = \prod_{i = 1}^r p_i$.
\begin{theorem}[Tensor Normal to Multi-Variate Normal equivalence]
For a multi-dimensional random variable $\ten{X}$ of order $r$ with dimensions $p_1\times ..., p_r$. Let $\ten{\mu}$ be the mean of the same order and dimensions as $\ten{X}$ and the mode covariance matrices $\mat{\Delta}_i$ of dimensions $p_i\times p_i$ for $i = 1, ..., n$. Then the tensor normal distribution is equivalent to the multi-variate normal distribution by the relation
\begin{displaymath}
\ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r)
\qquad\Leftrightarrow\qquad
\vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes ...\otimes \mat{\Delta}_1)
\end{displaymath}
where $p = \prod_{i = 1}^r p_i$.
\end{theorem}
\begin{proof}
A straight forward way is to rewrite the Tensor Normal density as the density of a Multi-Variate Normal distribution depending on the vectorization of $\ten{X}$. First consider
\begin{align*}
\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle
&= \t{\vec(\ten{X} - \mu)}\vec((\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\}) \\
&= \t{\vec(\ten{X} - \mu)}(\mat{\Delta}_r^{-1}\otimes ...\otimes\mat{\Delta}_1^{-1})\vec(\ten{X} - \mu) \\
&= \t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu).
\end{align*}
Next, using a property of the determinant of a Kronecker product $|\mat{\Delta}_1\otimes\mat{\Delta}_2| = |\mat{\Delta}_1|^{p_2}|\mat{\Delta}_2|^{p_1}$ yields
\begin{displaymath}
|\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
= |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_2|^{p_1}|\mat{\Delta}_1|^{p_{\lnot 1}}
\end{displaymath}
where $p_{\lnot i} = \prod_{j \neq i}p_j$. By induction over $r$ the relation
\begin{displaymath}
|\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
= \prod_{i = 1}^r |\mat{\Delta}_i|^{p_{\lnot i}}
\end{displaymath}
holds for arbitrary order $r$. Substituting into the Tensor Normal density leads to
\begin{align*}
f(\ten{X}) = \Big( (2\pi)^p |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| \Big)^{-1/2}
\exp\!\left( -\frac{1}{2}\t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu) \right)
\end{align*}
which is the Multi-Variate Normal density of the $p$ dimensional vector $\vec\ten{X}$.
\end{proof}
When sampling from the Multi-Array Normal one way is to sample from the Multi-Variate Normal and then reshaping the result, but this is usually very inefficient because it requires to store the multi-variate covariance matrix which is very big. Instead, it is more efficient to sample $\ten{Z}$ as a tensor of the same shape as $\ten{X}$ with standard normal entries and then transform the $\ten{Z}$ to follow the Multi-Array Normal as follows
\begin{displaymath}
\ten{Z}\sim\mathcal{TN}(0, \mat{I}_{p_1}, ..., \mat{I}_{p_r})
\quad\Rightarrow\quad
\ten{X} = \ten{Z}\times\{\mat{\Delta}_1^{1/2}, ..., \mat{\Delta}_r^{1/2}\} + \mu\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
\end{displaymath}
where the sampling from the standard Multi-Array Normal is done by sampling all of the elements of $\ten{Z}$ from a standard Normal.
\section{Introduction} \section{Introduction}
We assume the model We assume the model
@ -392,7 +332,7 @@ The unbiasednes comes directly from the following short computation;
= \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p (\mat{\Delta}_{2})_{l,l}(\mat{\Delta}_{1})_{j,k} = \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p (\mat{\Delta}_{2})_{l,l}(\mat{\Delta}_{1})_{j,k}
= (\mat\Delta_1\tr(\mat\Delta_2))_{j,k}. = (\mat\Delta_1\tr(\mat\Delta_2))_{j,k}.
\end{displaymath} \end{displaymath}
which means that $\E\widetilde{\mat\Delta}_1 = \mat\Delta_1\tr(\mat\Delta_2)$ and in analogy $\E\widetilde{\mat\Delta}_2 = \mat\Delta_2\tr(\mat\Delta_1)$. Now, we need to handle the scaling which can be estimated unbiasedly by which means that $\E\widetilde{\mat\Delta}_1 = \mat\Delta_1\tr(\mat\Delta_2)$ and in analogy $\E\widetilde{\mat\Delta}_2 = \mat\Delta_2\tr(\mat\Delta_1)$. Now, we need to handle the scaling which can be estimated unbiased by
\begin{displaymath} \begin{displaymath}
\tilde{s} = \frac{1}{n}\sum_{i = 1}^n \|\mat{R}_i\|_F^2 \tilde{s} = \frac{1}{n}\sum_{i = 1}^n \|\mat{R}_i\|_F^2
\end{displaymath} \end{displaymath}
@ -488,7 +428,7 @@ Putting it all together
&\hspace{3em}\hspace{3em} \hspace{4.7em} - \tilde{s}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1} \t{\mat{R}_i} \widetilde{\mat{\Delta}}_2^{-1}) &\hspace{3em}\hspace{3em} \hspace{4.7em} - \tilde{s}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1} \t{\mat{R}_i} \widetilde{\mat{\Delta}}_2^{-1})
\Big)\d\mat{\beta}\bigg). \Big)\d\mat{\beta}\bigg).
\end{align*} \end{align*}
Observe that the bracketed expressions before $\d\mat{\alpha}$ and $\d\mat{\beta}$ are transposes. Lets denote the expression for $\d\mat{\alpha}$ as $\mat{G}_i$ which has the form Observe that the bracketed expressions before $\d\mat{\alpha}$ and $\d\mat{\beta}$ are transposed of each other. Lets denote the expression for $\d\mat{\alpha}$ as $\mat{G}_i$ which has the form
\begin{displaymath} \begin{displaymath}
\mat{G}_i \mat{G}_i
= (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\mat{R}_i = (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\mat{R}_i
@ -528,7 +468,7 @@ Let $\ten{X}, \ten{F}$ be order (rank) $r$ tensors of dimensions $p_1\times ...
\begin{displaymath} \begin{displaymath}
\ten{X}_i = \ten{\mu} + \ten{F}_i\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon}_i \ten{X}_i = \ten{\mu} + \ten{F}_i\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon}_i
\end{displaymath} \end{displaymath}
where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distributed $\ten{\epsilon}\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for $\mat{\Delta}_j\in\mathcal{S}^{p_j}_{++}$, $j = 1, ..., r$. Given $i = 1, ..., n$ observations the collected model containing all observations where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distributed $\ten{\epsilon}\sim\mathcal{TM}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for $\mat{\Delta}_j\in\mathcal{S}^{p_j}_{++}$, $j = 1, ..., r$. Given $i = 1, ..., n$ observations the collected model containing all observations
\begin{displaymath} \begin{displaymath}
\ten{X} = \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon} \ten{X} = \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon}
\end{displaymath} \end{displaymath}
@ -536,87 +476,48 @@ which is almost identical as the observations $\ten{X}_i, \ten{F}_i$ are stacked
In the following we assume w.l.o.g that $\ten{\mu} = 0$, as if this is not true we simply replace $\ten{X}_i$ with $\ten{X}_i - \ten{\mu}$ for $i = 1, ..., n$ before collecting all the observations in the response tensor $\ten{X}$. In the following we assume w.l.o.g that $\ten{\mu} = 0$, as if this is not true we simply replace $\ten{X}_i$ with $\ten{X}_i - \ten{\mu}$ for $i = 1, ..., n$ before collecting all the observations in the response tensor $\ten{X}$.
The goal here is to find reasonable estimates for $\mat{\alpha}_j$, $j = 1, ..., n$ for the mean model The goal here is to find reasonable estimates for $\mat{\alpha}_i$, $i = 1, ..., n$ for the mean model
\begin{displaymath} \begin{displaymath}
\E \ten{X}|\ten{F}, \mat{\alpha}_1, ..., \mat{\alpha}_r = \ten{F}\times\{\mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n\} \E \ten{X}|\ten{F}, \mat{\alpha}_1, ..., \mat{\alpha}_r = \ten{F}\times\{\mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n\}
= \ten{F}\times_{j\in[r]}\mat{\alpha}_j. = \ten{F}\times_{i\in[r]}\mat{\alpha}_i.
\end{displaymath} \end{displaymath}
Under the mean model we have using the general mode product relation $(\ten{A}\times_j\mat{B})_{(j)} = \mat{B}\ten{A}_{(j)}$ we get Under the mean model we have using the general mode product relation $(\ten{A}\times_j\mat{B})_{(j)} = \mat{B}\ten{A}_{(j)}$ we get
\begin{align*} \begin{align*}
\ten{X}_{(j)}\t{\ten{X}_{(j)}} \overset{\text{SVD}}{=} \mat{U}_j\mat{D}_j\t{\mat{U}_j} \ten{X}_{(j)}\t{\ten{X}_{(j)}} \overset{\text{SVD}}{=} \mat{U}_j\mat{D}_j\t{\mat{U}_j}
= \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)} = \mat{\alpha}_j(\ten{F}\times_{i\in[r]\backslash j}\mat{\alpha}_i)_{(j)}
\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}\t{\mat{\alpha}_j} \t{(\ten{F}\times_{i\in[r]\backslash j}\mat{\alpha}_i)_{(j)}}\t{\mat{\alpha}_j}
\end{align*} \end{align*}
for the $j = 1, ..., r$ modes. Using this relation we construct an iterative estimation process by setting the initial estimates of $\hat{\mat{\alpha}}_j^{(0)} = \mat{U}_j[, 1:q_j]$ which are the first $q_j$ columns of $\mat{U}_j$. for the $j = 1, ..., r$ modes. Using this relation we construct an iterative estimation process by setting the initial estimates of $\hat{\mat{\alpha}}_j^{(0)} = \mat{U}_j[, 1:q_j]$ which are the first $q_j$ columns of $\mat{U}_j$.
For getting least squares estimates for $\mat{\alpha}_j$, $j = 1, ..., r$ we observe that by matricization of the mean model \todo{continue}
\begin{displaymath}
\ten{X}_{(j)} = (\ten{F}\times_{k\in[r]}\mat{\alpha}_k)_{(j)} = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}
\end{displaymath}
leads to normal equations for each $\mat{\alpha}_j$, $j = 1, ..., r$
\begin{displaymath}
\ten{X}_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}
\end{displaymath}
where the normal equations for $\mat{\alpha}_j$ depend on all the other $\mat{\alpha}_k$. With the initial estimates from above this allows an alternating approach. Index with $t = 1, ...$ the current iteration, then a new estimate $\widehat{\mat{\alpha}}_j^{(t)}$ given the previous estimates $\widehat{\mat{\alpha}}_k^{(t-1)}$, $k = 1, ..., r$ is computed as
\begin{displaymath}
\widehat{\mat{\alpha}}_j^{(t)} =
\ten{X}_{(j)}
\t{\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}}
\left(
\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}
\t{\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}}
\right)^{-1}
\end{displaymath}
for $j = 1, ..., r$ until convergence or a maximum number of iterations is exceeded. The final estimates are the least squares estimates by this procedure.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Numerical Examples %%% %%% Numerical Examples %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Numerical Examples} \section{Numerical Examples}
% The first example (which by it self is \emph{not} exemplary) is the estimation with parameters $n = 200$, $p = 11$, $q = 5$, $k = 14$ and $r = 9$. The ``true'' matrices $\mat\alpha$, $\mat\beta$ generated by sampling there elements i.i.d. standard normal like the responses $y$. Then, for each observation, $\mat{f}_y$ is computed as $\sin(s_{i, j} y + o_{i j})$ \todo{ properly describe} to fill the elements of $\mat{f}_y$. Then the $\mat{X}$'s are samples as The first example (which by it self is \emph{not} exemplary) is the estimation with parameters $n = 200$, $p = 11$, $q = 5$, $k = 14$ and $r = 9$. The ``true'' matrices $\mat\alpha$, $\mat\beta$ generated by sampling there elements i.i.d. standard normal like the responses $y$. Then, for each observation, $\mat{f}_y$ is computed as $\sin(s_{i, j} y + o_{i j})$ \todo{ properly describe} to fill the elements of $\mat{f}_y$. Then the $\mat{X}$'s are samples as
% \begin{displaymath} \begin{displaymath}
% \mat{X} = \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}, \qquad \vec{\mat{\epsilon}} \sim \mathbb{N}_{p q}(\mat{0}, \mat{\Delta}) \mat{X} = \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}, \qquad \vec{\mat{\epsilon}} \sim \mathbb{N}_{p q}(\mat{0}, \mat{\Delta})
% \end{displaymath} \end{displaymath}
% where $\mat{\Delta}_{i j} = 0.5^{|i - j|}$ for $i, j = 1, ..., p q$. where $\mat{\Delta}_{i j} = 0.5^{|i - j|}$ for $i, j = 1, ..., p q$.
\begin{table}[!ht] \begin{figure}
\centering \centering
% see: https://en.wikibooks.org/wiki/LaTeX/Tables \includegraphics{loss_Ex01.png}
\begin{tabular}{ll|r@{ }l *{3}{r@{.}l}} \end{figure}
method & init \begin{figure}
& \multicolumn{2}{c}{loss} \centering
& \multicolumn{2}{c}{MSE} \includegraphics{estimates_Ex01.png}
& \multicolumn{2}{c}{$\dist(\hat{\mat\alpha}, \mat\alpha)$} \end{figure}
& \multicolumn{2}{c}{$\dist(\hat{\mat\beta}, \mat\beta)$} \begin{figure}
\\ \hline \centering
base & vlp & -2642&(1594) & 1&82 (2.714) & 0&248 (0.447) & 0&271 (0.458) \\ \includegraphics{Delta_Ex01.png}
new & vlp & -2704&(1452) & 1&78 (2.658) & 0&233 (0.438) & 0&260 (0.448) \\ \end{figure}
new & ls & -3479& (95) & 0&99 (0.025) & 0&037 (0.017) & 0&035 (0.015) \\ \begin{figure}
momentum & vlp & -2704&(1452) & 1&78 (2.658) & 0&233 (0.438) & 0&260 (0.448) \\ \centering
momentum & ls & -3479& (95) & 0&99 (0.025) & 0&037 (0.017) & 0&035 (0.015) \\ \includegraphics{hist_Ex01.png}
approx & vlp & 6819&(1995) & 3&99 (12.256) & 0&267 (0.448) & 0&287 (0.457) \\ \end{figure}
approx & ls & 5457& (163) & 0&99 (0.025) & 0&033 (0.017) & 0&030 (0.012) \\
\end{tabular}
\caption{Mean (standard deviation) for simulated runs of $20$ repititions for the model $\mat{X} = \mat{\beta}\mat{f}_y\t{\mat{\alpha}}$ of dimensinos $(p, q) = (11, 7)$, $(k, r) = (3, 5)$ with a sample size of $n = 200$. The covariance structure is $\mat{\Delta} = \mat{\Delta}_2\otimes \mat{\Delta}_1$ for $\Delta_i = \text{AR}(\sqrt{0.5})$, $i = 1, 2$. The functions applied to the standard normal response $y$ are $\sin, \cos$ with increasing frequency.}
\end{table}
% \begin{figure}
% \centering
% \includegraphics{loss_Ex01.png}
% \end{figure}
% \begin{figure}
% \centering
% \includegraphics{estimates_Ex01.png}
% \end{figure}
% \begin{figure}
% \centering
% \includegraphics{Delta_Ex01.png}
% \end{figure}
% \begin{figure}
% \centering
% \includegraphics{hist_Ex01.png}
% \end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Bib and Index %%% %%% Bib and Index %%%
@ -753,33 +654,41 @@ Therefore, we use $\mat{A}, \mat{B}, \mat{X}, \mat{F}, \mat{R}, ...$ for matrice
\ten{T}\ttm[2]\mat{B}\ &{\widehat{=}}\ \mat{B}\ten{T}_{(2)} & \ten{T}(n, p, q), \ten{T}_{(2)}(p, n q), \mat{B}(q, p) \ten{T}\ttm[2]\mat{B}\ &{\widehat{=}}\ \mat{B}\ten{T}_{(2)} & \ten{T}(n, p, q), \ten{T}_{(2)}(p, n q), \mat{B}(q, p)
\end{align*} \end{align*}
% \section{Matrix Valued Normal Distribution} \section{Matrix Valued Normal Distribution}
% A random variable $\mat{X}$ of dimensions $p\times q$ is \emph{Matrix-Valued Normal Distribution}, denoted A random variable $\mat{X}$ of dimensions $p\times q$ is \emph{Matrix-Valued Normal Distribution}, denoted
% \begin{displaymath} \begin{displaymath}
% \mat{X}\sim\mathcal{MN}_{p\times q}(\mat{\mu}, \mat{\Delta}_2, \mat{\Delta}_1), \mat{X}\sim\mathcal{MN}_{p\times q}(\mat{\mu}, \mat{\Delta}_2, \mat{\Delta}_1),
% \end{displaymath} \end{displaymath}
% if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otimes\mat\Delta_2)$. Note the order of the covariance matrices $\mat\Delta_1, \mat\Delta_2$. Its density is given by if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otimes\mat\Delta_2)$. Note the order of the covariance matrices $\mat\Delta_1, \mat\Delta_2$. Its density is given by
% \begin{displaymath} \begin{displaymath}
% f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right). f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right).
% \end{displaymath} \end{displaymath}
% \section{Sampling form a Multi-Array Normal Distribution} \section{Sampling form a Multi-Array Normal Distribution}
% Let $\ten{X}$ be an order (rank) $r$ Multi-Array random variable of dimensions $p_1\times...\times p_r$ following a Multi-Array (or Tensor) Normal distributed Let $\ten{X}$ be an order (rank) $r$ Multi-Array random variable of dimensions $p_1\times...\times p_r$ following a Multi-Array (or Tensor) Normal distributed
% \begin{displaymath} \begin{displaymath}
% \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r). \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
% \end{displaymath} \end{displaymath}
% Its density is given by Its density is given by
% \begin{displaymath} \begin{displaymath}
% f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1} f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1}
% \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right) \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
% \end{displaymath} \end{displaymath}
% with $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution with $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
% \begin{displaymath} \begin{displaymath}
% \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1) \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1)
% \end{displaymath} \end{displaymath}
% with $p = \prod_{i = 1}^r p_i$. with $p = \prod_{i = 1}^r p_i$.
% \todo{Check this!!!} When sampling from the Multi-Array Normal one way is to sample from the Multi-Variate Normal and then reshaping the result, but this is usually very inefficient because it requires to store the multi-variate covariance matrix which is very big. Instead, it is more efficient to sample $\ten{Z}$ as a tensor of the same shape as $\ten{X}$ with standard normal entries and then transform the $\ten{Z}$ to follow the Multi-Array Normal as follows
\begin{displaymath}
\ten{Z}\sim\mathcal{TN}(0, \mat{I}_{p_1}, ..., \mat{I}_{p_r})
\quad\Rightarrow\quad
\ten{X} = \ten{Z}\times\{\mat{\Delta}_1^{1/2}, ..., \mat{\Delta}_r^{1/2}\} + \mu\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
\end{displaymath}
where the sampling from the standard Multi-Array Normal is done by sampling all of the elements of $\ten{Z}$ from a standard Normal.
\todo{Check this!!!}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Reference Summaries %%% %%% Reference Summaries %%%

View File

@ -13,32 +13,21 @@ log.prog <- function(max.iter) {
### Exec all methods for a given data set and collect logs ### ### Exec all methods for a given data set and collect logs ###
sim <- function(X, Fy, alpha.true, beta.true, max.iter = 500L) { sim <- function(X, Fy, shape, alpha.true, beta.true, max.iter = 500L) {
# Logger creator # Logger creator
logger <- function(name) { logger <- function(name) {
eval(substitute(function(iter, loss, alpha, beta, ...) { eval(substitute(function(iter, loss, alpha, beta, ...) {
tryCatch({ hist[iter + 1L, ] <<- c(
hist[iter + 1L, ] <<- c( iter = iter,
iter = iter, loss = loss,
loss = loss, dist = (dist <- dist.subspace(c(kronecker(alpha.true, beta.true)),
dist = (dist <- dist.subspace(c(kronecker(alpha.true, beta.true)), c(kronecker(alpha, beta)))),
c(kronecker(alpha, beta)))), dist.alpha = (dist.alpha <- dist.subspace(c(alpha.true), c(alpha))),
dist.alpha = (dist.alpha <- dist.subspace(c(alpha.true), c(alpha))), dist.beta = (dist.beta <- dist.subspace(c( beta.true), c(beta ))),
dist.beta = (dist.beta <- dist.subspace(c( beta.true), c(beta ))), norm.alpha = norm(alpha, "F"),
norm.alpha = norm(alpha, "F"), norm.beta = norm(beta, "F")
norm.beta = norm(beta, "F"), )
mse = mean((X - mlm(Fy, alpha, beta, modes = 3:2))^2)
)},
error = function(e) {
cat("Error in ", name,
", dim(alpha): ", dim(alpha),
", dim(alpha.true): ", dim(alpha.true),
", dim(beta)", dim(beta),
", dim(beta.true)", dim(beta.true),
"\n")
stop(e)
})
cat(sprintf( cat(sprintf(
"%s(%3d) | l = %-12.4f - dist = %-.4e - alpha(%d, %d) = %-.4e - beta(%d, %d) = %-.4e\n", "%s(%3d) | l = %-12.4f - dist = %-.4e - alpha(%d, %d) = %-.4e - beta(%d, %d) = %-.4e\n",
@ -50,94 +39,62 @@ sim <- function(X, Fy, alpha.true, beta.true, max.iter = 500L) {
} }
# Initialize logger history targets # Initialize logger history targets
hist.base <- hist.base <- hist.new <- hist.momentum <- hist.approx <- # hist.kron <-
hist.new.vlp <- hist.new.ls <-
hist.ls <-
hist.momentum.vlp <- hist.momentum.ls <-
hist.approx.vlp <- hist.approx.ls <-
data.frame(iter = seq(0L, max.iter), data.frame(iter = seq(0L, max.iter),
loss = NA, dist = NA, dist.alpha = NA, dist.beta = NA, loss = NA, dist = NA, dist.alpha = NA, dist.beta = NA,
norm.alpha = NA, norm.beta = NA, mse = NA norm.alpha = NA, norm.beta = NA
) )
# Base (old) # Base (old)
kpir.base(X, Fy, max.iter = max.iter, logger = logger("base")) kpir.base(X, Fy, shape, max.iter = max.iter, logger = logger("base"))
# New (simple Gradient Descent, using VLP initialization) # New (simple Gradient Descent)
kpir.new(X, Fy, max.iter = max.iter, init.method = "vlp", kpir.new(X, Fy, shape, max.iter = max.iter, logger = logger("new"))
logger = logger("new.vlp"))
kpir.new(X, Fy, max.iter = max.iter, init.method = "ls",
logger = logger("new.ls"))
# Least Squares estimate (alternating estimation)
kpir.ls(X, Fy, sample.mode = 1L, max.iter = max.iter, logger = logger("ls"))
# Gradient Descent with Nesterov Momentum # Gradient Descent with Nesterov Momentum
kpir.momentum(X, Fy, max.iter = max.iter, init.method = "vlp", kpir.momentum(X, Fy, shape, max.iter = max.iter, logger = logger("momentum"))
logger = logger("momentum.vlp"))
kpir.momentum(X, Fy, max.iter = max.iter, init.method = "ls", # # Residual Covariance Kronecker product assumpton version
logger = logger("momentum.ls")) # kpir.kron(X, Fy, shape, max.iter = max.iter, logger = logger("kron"))
# Approximated MLE with Nesterov Momentum # Approximated MLE with Nesterov Momentum
kpir.approx(X, Fy, max.iter = max.iter, init.method = "vlp", kpir.approx(X, Fy, shape, max.iter = max.iter, logger = logger("approx"))
logger = logger("approx.vlp"))
kpir.approx(X, Fy, max.iter = max.iter, init.method = "ls",
logger = logger("approx.ls"))
# Add method tags # Add method tags
hist.base$method <- factor("base") hist.base$method <- factor("base")
hist.new.vlp$method <- factor("new") hist.new$method <- factor("new")
hist.new.ls$method <- factor("new") hist.momentum$method <- factor("momentum")
hist.ls$method <- factor("ls") # hist.kron$method <- factor("kron")
hist.momentum.vlp$method <- factor("momentum") hist.approx$method <- factor("approx")
hist.momentum.ls$method <- factor("momentum")
hist.approx.vlp$method <- factor("approx")
hist.approx.ls$method <- factor("approx")
# Add init. method tag
hist.base$init <- factor("vlp")
hist.new.vlp$init <- factor("vlp")
hist.new.ls$init <- factor("ls")
hist.ls$init <- factor("ls")
hist.momentum.vlp$init <- factor("vlp")
hist.momentum.ls$init <- factor("ls")
hist.approx.vlp$init <- factor("vlp")
hist.approx.ls$init <- factor("ls")
# Combine results and return # Combine results and return
rbind( rbind(hist.base, hist.new, hist.momentum, hist.approx) #, hist.kron
hist.base,
hist.new.vlp, hist.new.ls,
hist.ls,
hist.momentum.vlp, hist.momentum.ls,
hist.approx.vlp, hist.approx.ls
)
} }
## Plot helper function ## Plot helper function
plot.hist2 <- function(hist, response, type = "all", ...) { plot.hist2 <- function(hist, response, type = "all", ...) {
# Extract final results from history # Extract final results from history
sub <- na.omit(hist[c("iter", response, "method", "init", "repetition")]) sub <- na.omit(hist[c("iter", response, "method", "repetition")])
sub <- aggregate(sub, list(sub$method, sub$init, sub$repetition), tail, 1) sub <- aggregate(sub, list(sub$method, sub$repetition), tail, 1)
# Setup ggplot # Setup ggplot
p <- ggplot(hist, aes_(x = quote(iter), p <- ggplot(hist, aes_(x = quote(iter),
y = as.name(response), y = as.name(response),
color = quote(method), color = quote(method),
linetype = quote(init), group = quote(interaction(method, repetition))))
group = quote(interaction(method, repetition, init))))
# Add requested layers # Add requested layers
if (type == "all") { if (type == "all") {
p <- p + geom_line(na.rm = TRUE) p <- p + geom_line(na.rm = TRUE)
p <- p + geom_point(data = sub) p <- p + geom_point(data = sub)
} else if (type == "mean") { } else if (type == "mean") {
p <- p + geom_line(alpha = 0.4, na.rm = TRUE, linetype = "dotted") p <- p + geom_line(alpha = 0.5, na.rm = TRUE, linetype = "dotted")
p <- p + geom_point(data = sub, alpha = 0.4) p <- p + geom_point(data = sub, alpha = 0.5)
p <- p + geom_line(aes(group = interaction(method, init)), p <- p + geom_line(aes(group = method),
stat = "summary", fun = "mean", na.rm = TRUE) stat = "summary", fun = "mean", na.rm = TRUE)
} else if (type == "median") { } else if (type == "median") {
p <- p + geom_line(alpha = 0.4, na.rm = TRUE, linetype = "dotted") p <- p + geom_line(alpha = 0.5, na.rm = TRUE, linetype = "dotted")
p <- p + geom_point(data = sub, alpha = 0.4) p <- p + geom_point(data = sub, alpha = 0.5)
p <- p + geom_line(aes(group = interaction(method, init)), p <- p + geom_line(aes(group = method),
stat = "summary", fun = "median", na.rm = TRUE) stat = "summary", fun = "median", na.rm = TRUE)
} }
# return with theme and annotations # return with theme and annotations
@ -150,10 +107,10 @@ plot.hist2 <- function(hist, response, type = "all", ...) {
## Generate some test data / DEBUG ## Generate some test data / DEBUG
n <- 200 # Sample Size n <- 200 # Sample Size
p <- sample(2:15, 1) # 11 p <- sample(1:15, 1) # 11
q <- sample(2:15, 1) # 7 q <- sample(1:15, 1) # 3
k <- min(sample(1:15, 1), p - 1) # 3 k <- sample(1:15, 1) # 7
r <- min(sample(1:15, 1), q - 1) # 5 r <- sample(1:15, 1) # 5
print(c(n, p, q, k, r)) print(c(n, p, q, k, r))
hist <- NULL hist <- NULL
@ -172,10 +129,8 @@ for (rep in 1:reps) {
)) ))
Delta <- 0.5^abs(outer(seq_len(p * q), seq_len(p * q), `-`)) Delta <- 0.5^abs(outer(seq_len(p * q), seq_len(p * q), `-`))
X <- tcrossprod(Fy, kronecker(alpha, beta)) + CVarE:::rmvnorm(n, sigma = Delta) X <- tcrossprod(Fy, kronecker(alpha, beta)) + CVarE:::rmvnorm(n, sigma = Delta)
dim(X) <- c(n, p, q)
dim(Fy) <- c(n, k, r)
hist.sim <- sim(X, Fy, alpha.true, beta.true) hist.sim <- sim(X, Fy, shape = c(p, q, k, r), alpha.true, beta.true)
hist.sim$repetition <- rep hist.sim$repetition <- rep
hist <- rbind(hist, hist.sim) hist <- rbind(hist, hist.sim)
@ -212,22 +167,22 @@ for (response in c("loss", "dist", "dist.alpha", "dist.beta")) {
n <- 200 # Sample Size n <- 200 # Sample Size
p <- 11 # sample(1:15, 1) p <- 11 # sample(1:15, 1)
q <- 7 # sample(1:15, 1) q <- 3 # sample(1:15, 1)
k <- 3 # sample(1:15, 1) k <- 7 # sample(1:15, 1)
r <- 5 # sample(1:15, 1) r <- 5 # sample(1:15, 1)
print(c(n, p, q, k, r)) print(c(n, p, q, k, r))
hist <- NULL hist <- NULL
reps <- 20 reps <- 20
max.iter <- 2
Delta.1 <- sqrt(0.5)^abs(outer(seq_len(p), seq_len(p), `-`)) Delta.1 <- sqrt(0.5)^abs(outer(seq_len(q), seq_len(q), `-`))
Delta.2 <- sqrt(0.5)^abs(outer(seq_len(q), seq_len(q), `-`)) Delta.2 <- sqrt(0.5)^abs(outer(seq_len(p), seq_len(p), `-`))
Delta <- kronecker(Delta.1, Delta.2)
for (rep in 1:reps) { for (rep in 1:reps) {
cat(sprintf("\n\033[1m%4d / %d simulation rep. started\033[0m\n", rep, reps)) cat(sprintf("%4d / %d simulation rep. started\n", rep, reps))
alpha.1.true <- alpha.1 <- matrix(rnorm(q * r), q, r) alpha.true <- alpha <- matrix(rnorm(q * r), q, r)
alpha.2.true <- alpha.2 <- matrix(rnorm(p * k), p, k) beta.true <- beta <- matrix(rnorm(p * k), p, k)
y <- rnorm(n) y <- rnorm(n)
Fy <- do.call(cbind, Map(function(slope, offset) { Fy <- do.call(cbind, Map(function(slope, offset) {
sin(slope * y + offset) sin(slope * y + offset)
@ -235,16 +190,15 @@ for (rep in 1:reps) {
head(rep(seq(1, ceiling(0.5 * k * r)), each = 2), k * r), head(rep(seq(1, ceiling(0.5 * k * r)), each = 2), k * r),
head(rep(c(0, pi / 2), ceiling(0.5 * k * r)), k * r) head(rep(c(0, pi / 2), ceiling(0.5 * k * r)), k * r)
)) ))
dim(Fy) <- c(n, k, r) X <- tcrossprod(Fy, kronecker(alpha, beta)) + CVarE:::rmvnorm(n, sigma = Delta)
X <- mlm(Fy, alpha.1, alpha.2, modes = 3:2)
X <- X + rtensornorm(n, 0, Delta.1, Delta.2, sample.mode = 1L)
hist.sim <- sim(X, Fy, alpha.1.true, alpha.2.true, max.iter = max.iter) hist.sim <- sim(X, Fy, shape = c(p, q, k, r), alpha.true, beta.true)
hist.sim$repetition <- rep hist.sim$repetition <- rep
hist <- rbind(hist, hist.sim) hist <- rbind(hist, hist.sim)
} }
# Save simulation results # Save simulation results
sim.name <- "sim02" sim.name <- "sim02"
datetime <- format(Sys.time(), "%Y%m%dT%H%M") datetime <- format(Sys.time(), "%Y%m%dT%H%M")
@ -253,107 +207,14 @@ saveRDS(hist, file = sprintf("%s_%s.rds", sim.name, datetime))
# for GGPlot2, as factors for grouping # for GGPlot2, as factors for grouping
hist$repetition <- factor(hist$repetition) hist$repetition <- factor(hist$repetition)
for (response in c("loss", "mse", "dist", "dist.alpha", "dist.beta")) { for (response in c("loss", "dist", "dist.alpha", "dist.beta")) {
for (fun in c("all", "mean", "median")) { for (fun in c("all", "mean", "median")) {
title <- paste(fun, paste(c("n", "p", "q", "k", "r"), c(n, p, q, k, r), sep = "=", collapse = ", ")) print(plot.hist2(hist, response, fun, title = fun) + coord_trans(x = "log1p"))
print(plot.hist2(hist, response, fun, title = title) +
coord_trans(x = "log1p"))
dev.print(png, file = sprintf("%s_%s_%s_%s.png", sim.name, datetime, response, fun), dev.print(png, file = sprintf("%s_%s_%s_%s.png", sim.name, datetime, response, fun),
width = 768, height = 768, res = 125) width = 768, height = 768, res = 125)
if (response != "loss") {
print(plot.hist2(hist, response, fun, title = title) +
coord_trans(x = "log1p", y = "log1p"))
dev.print(png, file = sprintf("%s_%s_%s_%s_log.png", sim.name, datetime, response, fun),
width = 768, height = 768, res = 125)
}
} }
} }
stats <- local({
# final result from history
sub <- na.omit(hist)
sub <- aggregate(sub, list(
method = sub$method, init = sub$init, repetition = sub$repetition
), tail, 1)
# aggregate statistics over repetitions
stats.mean <- aggregate(subset(sub, select = c("loss", "mse", "dist.alpha", "dist.beta")),
list(method = sub$method, init = sub$init), mean)
stats.sd <- aggregate(subset(sub, select = c("loss", "mse", "dist.alpha", "dist.beta")),
list(method = sub$method, init = sub$init), sd)
# merge mean and sd stats together
merge(stats.mean, stats.sd, by = c("method", "init"), suffixes = c(".mean", ".sd"))
})
print(stats, digits = 2)
# method init loss.mean mse.mean dist.alpha.mean dist.beta.mean loss.sd mse.sd dist.alpha.sd dist.beta.sd
# 1 approx ls 5457 0.99 0.033 0.030 163 0.025 0.017 0.012
# 2 approx vlp 6819 3.99 0.267 0.287 1995 12.256 0.448 0.457
# 3 base vlp -2642 1.82 0.248 0.271 1594 2.714 0.447 0.458
# 4 momentum ls -3479 0.99 0.037 0.035 95 0.025 0.017 0.015
# 5 momentum vlp -2704 1.78 0.233 0.260 1452 2.658 0.438 0.448
# 6 new ls -3479 0.99 0.037 0.035 95 0.025 0.017 0.015
# 7 new vlp -2704 1.78 0.233 0.260 1452 2.658 0.438 0.448
################################################################################
### Sim 3 ###
################################################################################
n <- 200
p <- c(7, 11, 5) # response dimensions (order 3)
q <- c(3, 6, 2) # predictor dimensions (order 3)
# currently only kpir.ls suppoert higher orders (order > 2)
sim3 <- function(X, Fy, alphas.true, max.iter = 500L) {
# Logger creator
logger <- function(name) {
eval(substitute(function(iter, loss, alpha, beta, ...) {
hist[iter + 1L, ] <<- c(
iter = iter,
loss = loss,
mse = (mse <- mean((X - mlm(Fy, alpha, beta, modes = 3:2))^2)),
(dist <- unlist(Map(dist.subspace, alphas, alphas.true)))
)
cat(sprintf(
"%s(%3d) | loss: %-12.4f - mse: %-12.4f - sum(dist): %-.4e\n",
name, iter, loss, sum(dist)
))
}, list(hist = as.symbol(paste0("hist.", name)))))
}
# Initialize logger history targets
hist.ls <-
do.call(data.frame, c(list(
iter = seq(0, r), loss = NA, mse = NA),
dist = rep(NA, length(dim(X)) - 1L)
))
# Approximated MLE with Nesterov Momentum
kpir.ls(X, Fy, sample.mode = 1L, max.iter = max.iter, logger = logger("ls"))
# Add method tags
hist.ls$method <- factor("ls")
# # Combine results and return
# rbind(hist.base, hist.new, hist.momentum, hist.approx, hist.ls)
hist.ls
}
sample.data3 <- function(n, p, q) {
stopifnot(length(p) == length(q))
stopifnot(all(q <= p))
Deltas <- Map(function(nrow) {
}, p)
list(X, Fy, alphas, Deltas)
}
################################################################################ ################################################################################
### WIP ### ### WIP ###
################################################################################ ################################################################################

View File

@ -1,26 +1,3 @@
################################################################################
### Sparce SIR against SIR ###
################################################################################
devtools::load_all('tensorPredictors/')
library(dr)
n <- 100
p <- 10
X <- rmvnorm(n, sigma = 0.5^abs(outer(1:p, 1:p, `-`)))
y <- rowSums(X[, 1:3]) + rnorm(n, 0.5)
B <- as.matrix(1:p <= 3) / sqrt(3)
dr.sir <- dr(y ~ X, method = 'sir', numdir = ncol(B))
B.sir <- dr.sir$evectors[, seq_len(ncol(B)), drop = FALSE]
dist.projection(B, B.sir)
B.poi <- with(GEP(X, y, 'sir'), {
POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = TRUE)$vectors
})
dist.projection(B, B.poi)
################################################################################ ################################################################################
### LDA (sparse Linear Discrimina Analysis) ### ### LDA (sparse Linear Discrimina Analysis) ###
@ -92,19 +69,19 @@ dataset <- function(nr) {
list(X = X, y = y, B = qr.Q(qr(B))) list(X = X, y = y, B = qr.Q(qr(B)))
} }
# Model 1 # # Model 1
fit <- with(dataset(1), { # fit <- with(dataset(1), {
with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C')) # with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C'))
}) # })
fit <- with(dataset(1), { # fit <- with(dataset(1), {
with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C')) # with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C'))
}) # })
fit <- with(dataset(1), { # fit <- with(dataset(1), {
with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = TRUE)) # with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = TRUE))
}) # })
fit <- with(dataset(1), { # fit <- with(dataset(1), {
with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C', use.C = TRUE)) # with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C', use.C = TRUE))
}) # })
# head(fit$vectors, 10) # head(fit$vectors, 10)
@ -113,24 +90,19 @@ nr.reps <- 100
sim <- replicate(nr.reps, { sim <- replicate(nr.reps, {
res <- double(0) res <- double(0)
for (model.nr in 1:5) { for (model.nr in 1:5) {
with(dataset(model.nr), { for (method in c('POI-C', 'FastPOI-C')) {
for (method in c('POI-C', 'FastPOI-C')) { for (use.C in c(FALSE, TRUE)) {
for (use.C in c(FALSE, TRUE)) { dist <- with(dataset(model.nr), {
fit <- with(GEP(X, y, 'lda'), { fit <- with(GEP(X, y, 'lda'), {
POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = use.C) POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = use.C)
}) })
dist <- dist.projection(B, fit$vectors) # dist.subspace(B, fit$vectors, is.ortho = FALSE, normalize = TRUE)
names(dist) <- paste('M', model.nr, '-', method, '-', use.C) dist.projection(B, fit$vectors)
res <<- c(res, dist) })
} names(dist) <- paste('M', model.nr, '-', method, '-', use.C)
res <- c(res, dist)
} }
fit <- with(GEP(X, y, 'lda'), { }
solve.gep(lhs, rhs, ncol(B))
})
dist <- dist.projection(B, fit$vectors)
names(dist) <- paste('M', model.nr, '- solve -', use.C)
res <<- c(res, dist)
})
} }
cat("Counter", (count <<- count + 1), "/", nr.reps, "\n") cat("Counter", (count <<- count + 1), "/", nr.reps, "\n")
res res

View File

@ -15,17 +15,15 @@ export(dist.projection)
export(dist.subspace) export(dist.subspace)
export(kpir.approx) export(kpir.approx)
export(kpir.base) export(kpir.base)
export(kpir.ls) export(kpir.kron)
export(kpir.momentum) export(kpir.momentum)
export(kpir.new) export(kpir.new)
export(mat) export(mat)
export(matpow) export(matpow)
export(matrixImage) export(matrixImage)
export(mcrossprod) export(mcrossprod)
export(mlm)
export(reduce) export(reduce)
export(rowKronecker) export(rowKronecker)
export(rtensornorm)
export(tensor_predictor) export(tensor_predictor)
export(ttm) export(ttm)
import(stats) import(stats)

View File

@ -84,9 +84,7 @@
#' Reduction and Variable Selection" By Xin Chen, Changliang Zou and #' Reduction and Variable Selection" By Xin Chen, Changliang Zou and
#' R. Dennis Cook. #' R. Dennis Cook.
#' #'
#' @note for speed reasons this functions attempts to use #' @suggest RSpectra
#' \code{\link[RSpectra]{eigs_sym}} if \pkg{\link{RSpectra}} is installed,
#' otherwise \code{\link{eigen}} is used which might be significantly slower.
#' #'
#' @export #' @export
CISE <- function(M, N, d = 1L, method = "PFC", max.iter = 100L, Theta = NULL, CISE <- function(M, N, d = 1L, method = "PFC", max.iter = 100L, Theta = NULL,

View File

@ -40,7 +40,7 @@ dist.subspace <- function (A, B, is.ortho = FALSE, normalize = FALSE,
if (normalize) { if (normalize) {
rankSum <- ncol(A) + ncol(B) rankSum <- ncol(A) + ncol(B)
c <- 1 / sqrt(max(1, min(rankSum, 2 * nrow(A) - rankSum))) c <- 1 / sqrt(min(rankSum, 2 * nrow(A) - rankSum))
} else { } else {
c <- sqrt(2) c <- sqrt(2)
} }

View File

@ -9,7 +9,6 @@
kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3, max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)), nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
max.init.iter = 20L, init.method = c("ls", "vlp"),
eps = .Machine$double.eps, eps = .Machine$double.eps,
logger = NULL logger = NULL
) { ) {
@ -18,7 +17,7 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
stopifnot(nrow(X) == NROW(Fy)) stopifnot(nrow(X) == NROW(Fy))
n <- nrow(X) # Number of observations n <- nrow(X) # Number of observations
# Check predictor dimensions # Get and check predictor dimensions
if (length(dim(X)) == 2L) { if (length(dim(X)) == 2L) {
stopifnot(!missing(shape)) stopifnot(!missing(shape))
stopifnot(ncol(X) == prod(shape[1:2])) stopifnot(ncol(X) == prod(shape[1:2]))
@ -27,16 +26,18 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
} else if (length(dim(X)) == 3L) { } else if (length(dim(X)) == 3L) {
p <- dim(X)[2] p <- dim(X)[2]
q <- dim(X)[3] q <- dim(X)[3]
dim(X) <- c(n, p * q)
} else { } else {
stop("'X' must be a matrix or 3-tensor") stop("'X' must be a matrix or 3-tensor")
} }
# Check response dimensions # Get and check response dimensions
if (!is.array(Fy)) { if (!is.array(Fy)) {
Fy <- as.array(Fy) Fy <- as.array(Fy)
} }
if (length(dim(Fy)) == 1L) { if (length(dim(Fy)) == 1L) {
k <- r <- 1L k <- r <- 1L
dim(Fy) <- c(n, 1L)
} else if (length(dim(Fy)) == 2L) { } else if (length(dim(Fy)) == 2L) {
stopifnot(!missing(shape)) stopifnot(!missing(shape))
stopifnot(ncol(Fy) == prod(shape[3:4])) stopifnot(ncol(Fy) == prod(shape[3:4]))
@ -45,44 +46,34 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
} else if (length(dim(Fy)) == 3L) { } else if (length(dim(Fy)) == 3L) {
k <- dim(Fy)[2] k <- dim(Fy)[2]
r <- dim(Fy)[3] r <- dim(Fy)[3]
dim(Fy) <- c(n, k * r)
} else { } else {
stop("'Fy' must be a vector, matrix or 3-tensor") stop("'Fy' must be a vector, matrix or 3-tensor")
} }
### Step 1: (Approx) Least Squares initial estimate ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
init.method <- match.arg(init.method) # Vectorize
if (init.method == "ls") { dim(Fy) <- c(n, k * r)
# De-Vectroize (from now on tensor arithmetics) dim(X) <- c(n, p * q)
dim(Fy) <- c(n, k, r) # Solve
dim(X) <- c(n, p, q) cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
if (n <= k * r || qr(cpFy)$rank < k * r) {
ls <- kpir.ls(X, Fy, max.iter = max.init.iter, sample.mode = 1L, eps = eps) # In case of under-determined system replace the inverse in the normal
c(beta0, alpha0) %<-% ls$alphas # equation by the Moore-Penrose Pseudo Inverse
} else { # Van Loan and Pitsianis B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
# Vectorize } else {
dim(Fy) <- c(n, k * r) # Compute OLS estimate by the Normal Equation
dim(X) <- c(n, p * q) B <- t(solve(cpFy, crossprod(Fy, X)))
# solution for `X = Fy B' + epsilon`
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
if (n <= k * r || qr(cpFy)$rank < k * r) {
# In case of under-determined system replace the inverse in the normal
# equation by the Moore-Penrose Pseudo Inverse
B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
} else {
# Compute OLS estimate by the Normal Equation
B <- t(solve(cpFy, crossprod(Fy, X)))
}
# De-Vectroize (from now on tensor arithmetics)
dim(Fy) <- c(n, k, r)
dim(X) <- c(n, p, q)
# Decompose `B = alpha x beta` into `alpha` and `beta`
c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
} }
# De-Vectroize (from now on tensor arithmetics)
dim(Fy) <- c(n, k, r)
dim(X) <- c(n, p, q)
# Decompose `B = alpha x beta` into `alpha` and `beta`
c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
# Compute residuals # Compute residuals
R <- X - (Fy %x_3% alpha0 %x_2% beta0) R <- X - (Fy %x_3% alpha0 %x_2% beta0)

View File

@ -0,0 +1,216 @@
#' Gradient Descent Bases Tensor Predictors method with Nesterov Accelerated
#' Momentum and Kronecker structure assumption for the residual covariance
#' `Delta = Delta.1 %x% Delta.2` (simple plugin version!)
#'
#' @export
kpir.kron <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
nesterov.scaling = function(a, t) { 0.5 * (1 + sqrt(1 + (2 * a)^2)) },
eps = .Machine$double.eps,
logger = NULL
) {
# Check if X and Fy have same number of observations
stopifnot(nrow(X) == NROW(Fy))
n <- nrow(X) # Number of observations
# Get and check predictor dimensions (convert to 3-tensor if needed)
if (length(dim(X)) == 2L) {
stopifnot(!missing(shape))
stopifnot(ncol(X) == prod(shape[1:2]))
p <- as.integer(shape[1]) # Predictor "height"
q <- as.integer(shape[2]) # Predictor "width"
} else if (length(dim(X)) == 3L) {
p <- dim(X)[2]
q <- dim(X)[3]
} else {
stop("'X' must be a matrix or 3-tensor")
}
# Get and check response dimensions (and convert to 3-tensor if needed)
if (!is.array(Fy)) {
Fy <- as.array(Fy)
}
if (length(dim(Fy)) == 1L) {
k <- r <- 1L
dim(Fy) <- c(n, 1L, 1L)
} else if (length(dim(Fy)) == 2L) {
stopifnot(!missing(shape))
stopifnot(ncol(Fy) == prod(shape[3:4]))
k <- as.integer(shape[3]) # Response functional "height"
r <- as.integer(shape[4]) # Response functional "width"
} else if (length(dim(Fy)) == 3L) {
k <- dim(Fy)[2]
r <- dim(Fy)[3]
} else {
stop("'Fy' must be a vector, matrix or 3-tensor")
}
### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
# Vectorize
dim(Fy) <- c(n, k * r)
dim(X) <- c(n, p * q)
# Solve
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
if (n <= k * r || qr(cpFy)$rank < k * r) {
# In case of under-determined system replace the inverse in the normal
# equation by the Moore-Penrose Pseudo Inverse
B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
} else {
# Compute OLS estimate by the Normal Equation
B <- t(solve(cpFy, crossprod(Fy, X)))
}
# De-Vectroize (from now on tensor arithmetics)
dim(Fy) <- c(n, k, r)
dim(X) <- c(n, p, q)
# Decompose `B = alpha x beta` into `alpha` and `beta`
c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
# Compute residuals
resid <- X - (Fy %x_3% alpha0 %x_2% beta0)
# Covariance estimate
Delta.1 <- tcrossprod(mat(resid, 3))
Delta.2 <- tcrossprod(mat(resid, 2))
tr <- sum(diag(Delta.1))
Delta.1 <- Delta.1 / sqrt(n * tr)
Delta.2 <- Delta.2 / sqrt(n * tr)
# Transformed Residuals
resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)
# Evaluate negative log-likelihood
loss <- 0.5 * (n * (p * log(det(Delta.1)) + q * log(det(Delta.2))) +
sum(resid.trans * resid))
# Call history callback (logger) before the first iterate
if (is.function(logger)) {
logger(0L, loss, alpha0, beta0, Delta.1, Delta.2, NA)
}
### Step 2: MLE with LS solution as starting value
a0 <- 0
a1 <- 1
alpha1 <- alpha0
beta1 <- beta0
# main descent loop
no.nesterov <- TRUE
break.reason <- NA
for (iter in seq_len(max.iter)) {
if (no.nesterov) {
# without extrapolation as fallback
S.alpha <- alpha1
S.beta <- beta1
} else {
# extrapolation using previous direction
S.alpha <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0)
S.beta <- beta1 + ((a0 - 1) / a1) * ( beta1 - beta0)
}
# Extrapolated Residuals
resid <- X - (Fy %x_3% S.alpha %x_2% S.beta)
# Covariance Estimates
Delta.1 <- tcrossprod(mat(resid, 3))
Delta.2 <- tcrossprod(mat(resid, 2))
tr <- sum(diag(Delta.1))
Delta.1 <- Delta.1 / sqrt(n * tr)
Delta.2 <- Delta.2 / sqrt(n * tr)
# Transform Residuals
resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)
# Calculate Gradients
grad.alpha <- tcrossprod(mat(resid.trans, 3), mat(Fy %x_2% S.beta, 3))
grad.beta <- tcrossprod(mat(resid.trans, 2), mat(Fy %x_3% S.alpha, 2))
# Backtracking line search (Armijo type)
# The `inner.prod` is used in the Armijo break condition but does not
# depend on the step size.
inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
# backtracking loop
for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) {
# Update `alpha` and `beta` (note: add(+), the gradients are already
# pointing into the negative slope direction of the loss cause they are
# the gradients of the log-likelihood [NOT the negative log-likelihood])
alpha.temp <- S.alpha + delta * grad.alpha
beta.temp <- S.beta + delta * grad.beta
# Update Residuals, Covariance and transformed Residuals
resid <- X - (Fy %x_3% alpha.temp %x_2% beta.temp)
Delta.1 <- tcrossprod(mat(resid, 3))
Delta.2 <- tcrossprod(mat(resid, 2))
tr <- sum(diag(Delta.1))
Delta.1 <- Delta.1 / sqrt(n * tr)
Delta.2 <- Delta.2 / sqrt(n * tr)
resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)
# Evaluate negative log-likelihood
loss.temp <- 0.5 * (n * (p * log(det(Delta.1)) + q * log(det(Delta.2)))
+ sum(resid.trans * resid))
# Armijo line search break condition
if (loss.temp <= loss - 0.1 * delta * inner.prod) {
break
}
}
# Call logger (invoke history callback)
if (is.function(logger)) {
logger(iter, loss.temp, alpha.temp, beta.temp, Delta.1, Delta.2, delta)
}
# Ensure descent
if (loss.temp < loss) {
alpha0 <- alpha1
alpha1 <- alpha.temp
beta0 <- beta1
beta1 <- beta.temp
# check break conditions (in descent case)
if (mean(abs(alpha1)) + mean(abs(beta1)) < eps) {
break.reason <- "alpha, beta numerically zero"
break # basically, estimates are zero -> stop
}
if (inner.prod < eps * (p * q + r * k)) {
break.reason <- "mean squared gradient is smaller than epsilon"
break # mean squared gradient is smaller than epsilon -> stop
}
if (abs(loss.temp - loss) < eps) {
break.reason <- "decrease is too small (slow)"
break # decrease is too small (slow) -> stop
}
loss <- loss.temp
no.nesterov <- FALSE # always reset
} else if (!no.nesterov) {
no.nesterov <- TRUE # retry without momentum
next
} else {
break.reason <- "failed even without momentum"
break # failed even without momentum -> stop
}
# update momentum scaling
a0 <- a1
a1 <- nesterov.scaling(a1, iter)
# Set next iter starting step.size to line searched step size
# (while allowing it to encrease)
step.size <- 1.618034 * delta
}
list(
loss = loss,
alpha = alpha1, beta = beta1,
Delta.1 = Delta.1, Delta.2 = Delta.2,
break.reason = break.reason
)
}

View File

@ -1,77 +0,0 @@
#' Per mode (axis) alternating least squares estimate
#'
#' @param sample.mode index of the sample mode, a.k.a. observation axis index
#'
#' @export
kpir.ls <- function(X, Fy, max.iter = 20L, sample.mode = 1L,
eps = .Machine$double.eps, logger = NULL
) {
# Check if X and Fy have same number of observations
if (!is.array(Fy)) {
# scalar response case (add new axis of size 1)
dim(Fy) <- local({
dims <- rep(1, length(dim(X)))
dims[sample.mode] <- length(Fy)
dims
})
} else {
stopifnot(dim(X)[sample.mode] == dim(Fy)[sample.mode])
}
# Check dimensions
stopifnot(length(dim(X)) == length(dim(Fy)))
stopifnot(dim(X)[sample.mode] == dim(Fy)[sample.mode])
# and model constraints
stopifnot(all(dim(Fy) <= dim(X)))
# mode index sequence (exclude sample mode, a.k.a. observation axis)
modes <- seq_along(dim(X))[-sample.mode]
### Step 1: initial per mode estimates
alphas <- Map(function(mode, ncol) {
La.svd(mcrossprod(X, mode = mode), ncol)$u
}, modes, dim(Fy)[modes])
# # Scaling of alpha, such that `tr(alpha_i' alpha_i) = tr(alpha_j' alpha_j)``
# # for `i, j = 1, ..., r`.
# traces <- unlist(Map(function(alpha) sum(alpha^2)))
# alphas <- Map(`*`, prod(traces)^(1 / length(alphas)) / traces, alphas)
# Call history callback (logger) before the first iteration
if (is.function(logger)) { do.call(logger, c(0L, NA, rev(alphas))) }
### Step 2: iterate per mode (axis) least squares estimates
for (iter in seq_len(max.iter)) {
# cyclic iterate over modes
for (j in seq_along(modes)) {
# least squares solution for `alpha_j | alpha_i, i != j`
Z <- mlm(Fy, alphas[-j], modes = modes[-j])
alphas[[j]] <- t(solve(mcrossprod(Z, mode = modes[j]),
tcrossprod(mat(Z, modes[j]), mat(X, modes[j]))))
# TODO: alphas[[j]] <- t(solve(mcrossprod(Z, j), mcrossprod(Z, X, j)))
}
# # Scaling of alpha, such that `tr(alpha_i' alpha_i) = tr(alpha_j' alpha_j)``
# # for `i, j = 1, ..., r`.
# traces <- unlist(Map(function(alpha) sum(alpha^2)))
# alphas <- Map(`*`, prod(traces)^(1 / length(alphas)) / traces, alphas)
# Call logger (invoke history callback)
if (is.function(logger)) { do.call(logger, c(iter, NA, rev(alphas))) }
# TODO: add some kind of break condition
}
### Step 3: Moment estimates for `Delta_i`
# Residuals
R <- X - mlm(Fy, alphas, modes = modes)
# Moment estimates for `Delta_i`s
Deltas <- Map(mcrossprod, list(R), mode = modes)
Deltas <- Map(`*`, 1 / dim(X)[sample.mode], Deltas)
list(
alphas = structure(alphas, names = as.character(modes)),
Deltas = structure(Deltas, names = as.character(modes))
)
}

View File

@ -5,7 +5,6 @@
kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3, max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)), nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
max.init.iter = 20L, init.method = c("ls", "vlp"),
eps = .Machine$double.eps, eps = .Machine$double.eps,
logger = NULL logger = NULL
) { ) {
@ -49,31 +48,20 @@ kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
} }
### Step 1: (Approx) Least Squares initial estimate ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
init.method <- match.arg(init.method) cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
if (init.method == "ls") { if (n <= k * r || qr(cpFy)$rank < k * r) {
dim(X) <- c(n, p, q) # In case of under-determined system replace the inverse in the normal
dim(Fy) <- c(n, k, r) # equation by the Moore-Penrose Pseudo Inverse
ls <- kpir.ls(X, Fy, max.iter = max.init.iter, sample.mode = 1L, eps = eps) B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
c(beta0, alpha0) %<-% ls$alphas } else {
dim(X) <- c(n, p * q) # Compute OLS estimate by the Normal Equation
dim(Fy) <- c(n, k * r) B <- t(solve(cpFy, crossprod(Fy, X)))
} else { # Van Loan and Pitsianis
# solution for `X = Fy B' + epsilon`
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
if (n <= k * r || qr(cpFy)$rank < k * r) {
# In case of under-determined system replace the inverse in the normal
# equation by the Moore-Penrose Pseudo Inverse
B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
} else {
# Compute OLS estimate by the Normal Equation
B <- t(solve(cpFy, crossprod(Fy, X)))
}
# Decompose `B = alpha x beta` into `alpha` and `beta`
c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
} }
# Decompose `B = alpha x beta` into `alpha` and `beta`
c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
# Compute residuals # Compute residuals
resid <- X - tcrossprod(Fy, kronecker(alpha0, beta0)) resid <- X - tcrossprod(Fy, kronecker(alpha0, beta0))

View File

@ -3,7 +3,6 @@
#' @export #' @export
kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3, max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
max.init.iter = 20L, init.method = c("ls", "vlp"),
eps = .Machine$double.eps, eps = .Machine$double.eps,
logger = NULL logger = NULL
) { ) {
@ -46,31 +45,20 @@ kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
stop("'Fy' must be a vector, matrix or 3-tensor") stop("'Fy' must be a vector, matrix or 3-tensor")
} }
### Step 1: (Approx) Least Squares initial estimate ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
init.method <- match.arg(init.method) cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
if (init.method == "ls") { if (n <= k * r || qr(cpFy)$rank < k * r) {
dim(X) <- c(n, p, q) # In case of under-determined system replace the inverse in the normal
dim(Fy) <- c(n, k, r) # equation by the Moore-Penrose Pseudo Inverse
ls <- kpir.ls(X, Fy, max.iter = max.init.iter, sample.mode = 1L, eps = eps) B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
c(beta, alpha) %<-% ls$alphas } else {
dim(X) <- c(n, p * q) # Compute OLS estimate by the Normal Equation
dim(Fy) <- c(n, k * r) B <- t(solve(cpFy, crossprod(Fy, X)))
} else { # Van Loan and Pitsianis
# solution for `X = Fy B' + epsilon`
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
if (n <= k * r || qr(cpFy)$rank < k * r) {
# In case of under-determined system replace the inverse in the normal
# equation by the Moore-Penrose Pseudo Inverse
B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
} else {
# Compute OLS estimate by the Normal Equation
B <- t(solve(cpFy, crossprod(Fy, X)))
}
# Decompose `B = alpha x beta` into `alpha` and `beta`
c(alpha, beta) %<-% approx.kronecker(B, c(q, r), c(p, k))
} }
# Decompose `B = alpha x beta` into `alpha` and `beta`
c(alpha, beta) %<-% approx.kronecker(B, c(q, r), c(p, k))
# Compute residuals # Compute residuals
resid <- X - tcrossprod(Fy, kronecker(alpha, beta)) resid <- X - tcrossprod(Fy, kronecker(alpha, beta))

View File

@ -1,71 +0,0 @@
#' Multi Linear Multiplication
#'
#' C = A x { B1, ..., Br }
#'
#' @param A tensor (multi-linear array)
#' @param B matrix or list of matrices
#' @param ... further matrices, concatenated with \code{B}
#' @param modes integer sequence of the same length as number of matrices
#' supplied (in \code{B} and \code{...})
#'
#' @examples
#' # general usage
#' dimA <- c(3, 17, 19, 2)
#' dimC <- c(7, 11, 13, 5)
#' A <- array(rnorm(prod(dimA)), dim = dimA)
#' B <- Map(function(p, q) matrix(rnorm(p * q), p, q), dimC, dimA)
#' C1 <- mlm(A, B)
#' C2 <- mlm(A, B[[1]], B[[2]], B[[3]], B[[4]])
#' C3 <- mlm(A, B[[3]], B[[1]], B[[2]], B[[4]], modes = c(3, 1, 2, 4))
#' C4 <- mlm(A, B[1:3], B[[4]])
#' stopifnot(all.equal(C1, C2))
#' stopifnot(all.equal(C1, C3))
#' stopifnot(all.equal(C1, C4))
#'
#' # selected modes
#' C1 <- mlm(A, B, modes = 2:3)
#' C2 <- mlm(A, B[[2]], B[[3]], modes = 2:3)
#' C3 <- ttm(ttm(A, B[[2]], 2), B[[3]], 3)
#' stopifnot(all.equal(C1, C2))
#' stopifnot(all.equal(C1, C3))
#'
#' # analog to matrix multiplication
#' A <- matrix(rnorm( 6), 2, 3)
#' B <- matrix(rnorm(12), 3, 4)
#' C <- matrix(rnorm(20), 4, 5)
#' stopifnot(all.equal(
#' A %*% B %*% t(C),
#' mlm(B, list(A, C))
#' ))
#'
#' # usage with repeated modes (non commutative)
#' dimA <- c(3, 17, 19, 2)
#' A <- array(rnorm(prod(dimA)), dim = dimA)
#' B1 <- matrix(rnorm(9), 3, 3)
#' B2 <- matrix(rnorm(9), 3, 3)
#' C <- matrix(rnorm(4), 2, 2)
#' # same modes do NOT commute
#' all.equal(
#' mlm(A, B1, B2, C, modes = c(1, 1, 4)), # NOT equal!
#' mlm(A, B2, B1, C, modes = c(1, 1, 4))
#' )
#' # but different modes do commute
#' P1 <- mlm(A, C, B1, B2, modes = c(4, 1, 1))
#' P2 <- mlm(A, B1, C, B2, modes = c(1, 4, 1))
#' P3 <- mlm(A, B1, B2, C, modes = c(1, 1, 4))
#' stopifnot(all.equal(P1, P2))
#' stopifnot(all.equal(P1, P3))
#'
#' @export
mlm <- function(A, B, ..., modes = seq_along(B)) {
# Collect all matrices in `B`
B <- c(if (is.matrix(B)) list(B) else B, list(...))
# iteratively apply Tensor Times Matrix multiplication over modes
for (i in seq_along(modes)) {
A <- ttm(A, B[[i]], modes[i])
}
# return result tensor
A
}