add: kpir.ls as initial value estimate (alternative to VLP)
This commit is contained in:
parent
49bf4bdf20
commit
a2963024ef
|
@ -81,8 +81,12 @@ vignettes/*.pdf
|
||||||
*.RData
|
*.RData
|
||||||
*.Rdata
|
*.Rdata
|
||||||
|
|
||||||
|
# R Profiling
|
||||||
|
*.Rprof
|
||||||
|
|
||||||
# VSCode configuration
|
# VSCode configuration
|
||||||
.vscode/
|
.vscode/
|
||||||
|
.lintr
|
||||||
|
|
||||||
## Archives, compressed files/folders
|
## Archives, compressed files/folders
|
||||||
# Output files from R CMD build
|
# Output files from R CMD build
|
||||||
|
@ -104,3 +108,8 @@ wip/
|
||||||
|
|
||||||
mlda_analysis/
|
mlda_analysis/
|
||||||
References/
|
References/
|
||||||
|
|
||||||
|
# Images (except images used in LaTeX)
|
||||||
|
*.png
|
||||||
|
*.svg
|
||||||
|
!**/LaTeX/*.png
|
||||||
|
|
178
LaTeX/main.tex
178
LaTeX/main.tex
|
@ -49,6 +49,7 @@
|
||||||
\newcommand{\mat}[1]{\boldsymbol{#1}}
|
\newcommand{\mat}[1]{\boldsymbol{#1}}
|
||||||
\newcommand{\ten}[1]{\mathcal{#1}}
|
\newcommand{\ten}[1]{\mathcal{#1}}
|
||||||
\renewcommand{\vec}{\operatorname{vec}}
|
\renewcommand{\vec}{\operatorname{vec}}
|
||||||
|
\newcommand{\dist}{\operatorname{dist}}
|
||||||
\DeclareMathOperator{\kron}{\otimes} % Kronecker Product
|
\DeclareMathOperator{\kron}{\otimes} % Kronecker Product
|
||||||
\DeclareMathOperator{\hada}{\odot} % Hadamard Product
|
\DeclareMathOperator{\hada}{\odot} % Hadamard Product
|
||||||
\newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix)
|
\newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix)
|
||||||
|
@ -62,8 +63,8 @@
|
||||||
\DeclareMathOperator*{\argmax}{{arg\,max}}
|
\DeclareMathOperator*{\argmax}{{arg\,max}}
|
||||||
\newcommand{\D}{\textnormal{D}}
|
\newcommand{\D}{\textnormal{D}}
|
||||||
\renewcommand{\d}{\textnormal{d}}
|
\renewcommand{\d}{\textnormal{d}}
|
||||||
\renewcommand{\t}[1]{{{#1}'}}
|
\renewcommand{\t}[1]{{#1^{\prime}}}
|
||||||
\newcommand{\pinv}[1]{{{#1}^{\dagger}}} % `Moore-Penrose pseudoinverse`
|
\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse`
|
||||||
\newcommand{\todo}[1]{{\color{red}TODO: #1}}
|
\newcommand{\todo}[1]{{\color{red}TODO: #1}}
|
||||||
|
|
||||||
% \DeclareFontFamily{U}{mathx}{\hyphenchar\font45}
|
% \DeclareFontFamily{U}{mathx}{\hyphenchar\font45}
|
||||||
|
@ -117,17 +118,15 @@ Let $\ten{X}$ be a multi-dimensional array random variable of order (rank) $r$ w
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
Its density is given by
|
Its density is given by
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1}
|
f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{p_{-i}}} \Big)^{-1}
|
||||||
\exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
|
\exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
where $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
|
where $p_{\lnot i} = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1)
|
\vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1)
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
with $p = \prod_{i = 1}^r p_i$.
|
with $p = \prod_{i = 1}^r p_i$.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
\begin{theorem}[Tensor Normal to Multi-Variate Normal equivalence]
|
\begin{theorem}[Tensor Normal to Multi-Variate Normal equivalence]
|
||||||
For a multi-dimensional random variable $\ten{X}$ of order $r$ with dimensions $p_1\times ..., p_r$. Let $\ten{\mu}$ be the mean of the same order and dimensions as $\ten{X}$ and the mode covariance matrices $\mat{\Delta}_i$ of dimensions $p_i\times p_i$ for $i = 1, ..., n$. Then the tensor normal distribution is equivalent to the multi-variate normal distribution by the relation
|
For a multi-dimensional random variable $\ten{X}$ of order $r$ with dimensions $p_1\times ..., p_r$. Let $\ten{\mu}$ be the mean of the same order and dimensions as $\ten{X}$ and the mode covariance matrices $\mat{\Delta}_i$ of dimensions $p_i\times p_i$ for $i = 1, ..., n$. Then the tensor normal distribution is equivalent to the multi-variate normal distribution by the relation
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
|
@ -148,12 +147,12 @@ with $p = \prod_{i = 1}^r p_i$.
|
||||||
Next, using a property of the determinant of a Kronecker product $|\mat{\Delta}_1\otimes\mat{\Delta}_2| = |\mat{\Delta}_1|^{p_2}|\mat{\Delta}_2|^{p_1}$ yields
|
Next, using a property of the determinant of a Kronecker product $|\mat{\Delta}_1\otimes\mat{\Delta}_2| = |\mat{\Delta}_1|^{p_2}|\mat{\Delta}_2|^{p_1}$ yields
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
|\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
|
|\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
|
||||||
= |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_2|^{p_1}|\mat{\Delta}_1|^{q_1}
|
= |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_2|^{p_1}|\mat{\Delta}_1|^{p_{\lnot 1}}
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
where $q_i = \prod_{j \neq i}p_j$. By induction over $r$ the relation
|
where $p_{\lnot i} = \prod_{j \neq i}p_j$. By induction over $r$ the relation
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
|\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
|
|\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
|
||||||
= \prod_{i = 1}^r |\mat{\Delta}_i|^{q_i}
|
= \prod_{i = 1}^r |\mat{\Delta}_i|^{p_{\lnot i}}
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
holds for arbitrary order $r$. Substituting into the Tensor Normal density leads to
|
holds for arbitrary order $r$. Substituting into the Tensor Normal density leads to
|
||||||
\begin{align*}
|
\begin{align*}
|
||||||
|
@ -163,6 +162,14 @@ with $p = \prod_{i = 1}^r p_i$.
|
||||||
which is the Multi-Variate Normal density of the $p$ dimensional vector $\vec\ten{X}$.
|
which is the Multi-Variate Normal density of the $p$ dimensional vector $\vec\ten{X}$.
|
||||||
\end{proof}
|
\end{proof}
|
||||||
|
|
||||||
|
When sampling from the Multi-Array Normal one way is to sample from the Multi-Variate Normal and then reshaping the result, but this is usually very inefficient because it requires to store the multi-variate covariance matrix which is very big. Instead, it is more efficient to sample $\ten{Z}$ as a tensor of the same shape as $\ten{X}$ with standard normal entries and then transform the $\ten{Z}$ to follow the Multi-Array Normal as follows
|
||||||
|
\begin{displaymath}
|
||||||
|
\ten{Z}\sim\mathcal{TN}(0, \mat{I}_{p_1}, ..., \mat{I}_{p_r})
|
||||||
|
\quad\Rightarrow\quad
|
||||||
|
\ten{X} = \ten{Z}\times\{\mat{\Delta}_1^{1/2}, ..., \mat{\Delta}_r^{1/2}\} + \mu\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
|
||||||
|
\end{displaymath}
|
||||||
|
where the sampling from the standard Multi-Array Normal is done by sampling all of the elements of $\ten{Z}$ from a standard Normal.
|
||||||
|
|
||||||
|
|
||||||
\section{Introduction}
|
\section{Introduction}
|
||||||
We assume the model
|
We assume the model
|
||||||
|
@ -385,7 +392,7 @@ The unbiasednes comes directly from the following short computation;
|
||||||
= \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p (\mat{\Delta}_{2})_{l,l}(\mat{\Delta}_{1})_{j,k}
|
= \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p (\mat{\Delta}_{2})_{l,l}(\mat{\Delta}_{1})_{j,k}
|
||||||
= (\mat\Delta_1\tr(\mat\Delta_2))_{j,k}.
|
= (\mat\Delta_1\tr(\mat\Delta_2))_{j,k}.
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
which means that $\E\widetilde{\mat\Delta}_1 = \mat\Delta_1\tr(\mat\Delta_2)$ and in analogy $\E\widetilde{\mat\Delta}_2 = \mat\Delta_2\tr(\mat\Delta_1)$. Now, we need to handle the scaling which can be estimated unbiased by
|
which means that $\E\widetilde{\mat\Delta}_1 = \mat\Delta_1\tr(\mat\Delta_2)$ and in analogy $\E\widetilde{\mat\Delta}_2 = \mat\Delta_2\tr(\mat\Delta_1)$. Now, we need to handle the scaling which can be estimated unbiasedly by
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\tilde{s} = \frac{1}{n}\sum_{i = 1}^n \|\mat{R}_i\|_F^2
|
\tilde{s} = \frac{1}{n}\sum_{i = 1}^n \|\mat{R}_i\|_F^2
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
|
@ -481,7 +488,7 @@ Putting it all together
|
||||||
&\hspace{3em}\hspace{3em} \hspace{4.7em} - \tilde{s}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1} \t{\mat{R}_i} \widetilde{\mat{\Delta}}_2^{-1})
|
&\hspace{3em}\hspace{3em} \hspace{4.7em} - \tilde{s}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1} \t{\mat{R}_i} \widetilde{\mat{\Delta}}_2^{-1})
|
||||||
\Big)\d\mat{\beta}\bigg).
|
\Big)\d\mat{\beta}\bigg).
|
||||||
\end{align*}
|
\end{align*}
|
||||||
Observe that the bracketed expressions before $\d\mat{\alpha}$ and $\d\mat{\beta}$ are transposed of each other. Lets denote the expression for $\d\mat{\alpha}$ as $\mat{G}_i$ which has the form
|
Observe that the bracketed expressions before $\d\mat{\alpha}$ and $\d\mat{\beta}$ are transposes. Lets denote the expression for $\d\mat{\alpha}$ as $\mat{G}_i$ which has the form
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\mat{G}_i
|
\mat{G}_i
|
||||||
= (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\mat{R}_i
|
= (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\mat{R}_i
|
||||||
|
@ -521,7 +528,7 @@ Let $\ten{X}, \ten{F}$ be order (rank) $r$ tensors of dimensions $p_1\times ...
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\ten{X}_i = \ten{\mu} + \ten{F}_i\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon}_i
|
\ten{X}_i = \ten{\mu} + \ten{F}_i\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon}_i
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distributed $\ten{\epsilon}\sim\mathcal{TM}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for $\mat{\Delta}_j\in\mathcal{S}^{p_j}_{++}$, $j = 1, ..., r$. Given $i = 1, ..., n$ observations the collected model containing all observations
|
where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distributed $\ten{\epsilon}\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for $\mat{\Delta}_j\in\mathcal{S}^{p_j}_{++}$, $j = 1, ..., r$. Given $i = 1, ..., n$ observations the collected model containing all observations
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\ten{X} = \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon}
|
\ten{X} = \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon}
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
|
@ -529,48 +536,87 @@ which is almost identical as the observations $\ten{X}_i, \ten{F}_i$ are stacked
|
||||||
|
|
||||||
In the following we assume w.l.o.g that $\ten{\mu} = 0$, as if this is not true we simply replace $\ten{X}_i$ with $\ten{X}_i - \ten{\mu}$ for $i = 1, ..., n$ before collecting all the observations in the response tensor $\ten{X}$.
|
In the following we assume w.l.o.g that $\ten{\mu} = 0$, as if this is not true we simply replace $\ten{X}_i$ with $\ten{X}_i - \ten{\mu}$ for $i = 1, ..., n$ before collecting all the observations in the response tensor $\ten{X}$.
|
||||||
|
|
||||||
The goal here is to find reasonable estimates for $\mat{\alpha}_i$, $i = 1, ..., n$ for the mean model
|
The goal here is to find reasonable estimates for $\mat{\alpha}_j$, $j = 1, ..., n$ for the mean model
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\E \ten{X}|\ten{F}, \mat{\alpha}_1, ..., \mat{\alpha}_r = \ten{F}\times\{\mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n\}
|
\E \ten{X}|\ten{F}, \mat{\alpha}_1, ..., \mat{\alpha}_r = \ten{F}\times\{\mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n\}
|
||||||
= \ten{F}\times_{i\in[r]}\mat{\alpha}_i.
|
= \ten{F}\times_{j\in[r]}\mat{\alpha}_j.
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
Under the mean model we have using the general mode product relation $(\ten{A}\times_j\mat{B})_{(j)} = \mat{B}\ten{A}_{(j)}$ we get
|
Under the mean model we have using the general mode product relation $(\ten{A}\times_j\mat{B})_{(j)} = \mat{B}\ten{A}_{(j)}$ we get
|
||||||
\begin{align*}
|
\begin{align*}
|
||||||
\ten{X}_{(j)}\t{\ten{X}_{(j)}} \overset{\text{SVD}}{=} \mat{U}_j\mat{D}_j\t{\mat{U}_j}
|
\ten{X}_{(j)}\t{\ten{X}_{(j)}} \overset{\text{SVD}}{=} \mat{U}_j\mat{D}_j\t{\mat{U}_j}
|
||||||
= \mat{\alpha}_j(\ten{F}\times_{i\in[r]\backslash j}\mat{\alpha}_i)_{(j)}
|
= \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}
|
||||||
\t{(\ten{F}\times_{i\in[r]\backslash j}\mat{\alpha}_i)_{(j)}}\t{\mat{\alpha}_j}
|
\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}\t{\mat{\alpha}_j}
|
||||||
\end{align*}
|
\end{align*}
|
||||||
for the $j = 1, ..., r$ modes. Using this relation we construct an iterative estimation process by setting the initial estimates of $\hat{\mat{\alpha}}_j^{(0)} = \mat{U}_j[, 1:q_j]$ which are the first $q_j$ columns of $\mat{U}_j$.
|
for the $j = 1, ..., r$ modes. Using this relation we construct an iterative estimation process by setting the initial estimates of $\hat{\mat{\alpha}}_j^{(0)} = \mat{U}_j[, 1:q_j]$ which are the first $q_j$ columns of $\mat{U}_j$.
|
||||||
|
|
||||||
\todo{continue}
|
For getting least squares estimates for $\mat{\alpha}_j$, $j = 1, ..., r$ we observe that by matricization of the mean model
|
||||||
|
\begin{displaymath}
|
||||||
|
\ten{X}_{(j)} = (\ten{F}\times_{k\in[r]}\mat{\alpha}_k)_{(j)} = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}
|
||||||
|
\end{displaymath}
|
||||||
|
leads to normal equations for each $\mat{\alpha}_j$, $j = 1, ..., r$
|
||||||
|
\begin{displaymath}
|
||||||
|
\ten{X}_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}
|
||||||
|
\end{displaymath}
|
||||||
|
where the normal equations for $\mat{\alpha}_j$ depend on all the other $\mat{\alpha}_k$. With the initial estimates from above this allows an alternating approach. Index with $t = 1, ...$ the current iteration, then a new estimate $\widehat{\mat{\alpha}}_j^{(t)}$ given the previous estimates $\widehat{\mat{\alpha}}_k^{(t-1)}$, $k = 1, ..., r$ is computed as
|
||||||
|
\begin{displaymath}
|
||||||
|
\widehat{\mat{\alpha}}_j^{(t)} =
|
||||||
|
\ten{X}_{(j)}
|
||||||
|
\t{\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}}
|
||||||
|
\left(
|
||||||
|
\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}
|
||||||
|
\t{\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}}
|
||||||
|
\right)^{-1}
|
||||||
|
\end{displaymath}
|
||||||
|
for $j = 1, ..., r$ until convergence or a maximum number of iterations is exceeded. The final estimates are the least squares estimates by this procedure.
|
||||||
|
|
||||||
|
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
%%% Numerical Examples %%%
|
%%% Numerical Examples %%%
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
\section{Numerical Examples}
|
\section{Numerical Examples}
|
||||||
The first example (which by it self is \emph{not} exemplary) is the estimation with parameters $n = 200$, $p = 11$, $q = 5$, $k = 14$ and $r = 9$. The ``true'' matrices $\mat\alpha$, $\mat\beta$ generated by sampling there elements i.i.d. standard normal like the responses $y$. Then, for each observation, $\mat{f}_y$ is computed as $\sin(s_{i, j} y + o_{i j})$ \todo{ properly describe} to fill the elements of $\mat{f}_y$. Then the $\mat{X}$'s are samples as
|
% The first example (which by it self is \emph{not} exemplary) is the estimation with parameters $n = 200$, $p = 11$, $q = 5$, $k = 14$ and $r = 9$. The ``true'' matrices $\mat\alpha$, $\mat\beta$ generated by sampling there elements i.i.d. standard normal like the responses $y$. Then, for each observation, $\mat{f}_y$ is computed as $\sin(s_{i, j} y + o_{i j})$ \todo{ properly describe} to fill the elements of $\mat{f}_y$. Then the $\mat{X}$'s are samples as
|
||||||
\begin{displaymath}
|
% \begin{displaymath}
|
||||||
\mat{X} = \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}, \qquad \vec{\mat{\epsilon}} \sim \mathbb{N}_{p q}(\mat{0}, \mat{\Delta})
|
% \mat{X} = \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}, \qquad \vec{\mat{\epsilon}} \sim \mathbb{N}_{p q}(\mat{0}, \mat{\Delta})
|
||||||
\end{displaymath}
|
% \end{displaymath}
|
||||||
where $\mat{\Delta}_{i j} = 0.5^{|i - j|}$ for $i, j = 1, ..., p q$.
|
% where $\mat{\Delta}_{i j} = 0.5^{|i - j|}$ for $i, j = 1, ..., p q$.
|
||||||
|
|
||||||
\begin{figure}
|
\begin{table}[!ht]
|
||||||
\centering
|
\centering
|
||||||
\includegraphics{loss_Ex01.png}
|
% see: https://en.wikibooks.org/wiki/LaTeX/Tables
|
||||||
\end{figure}
|
\begin{tabular}{ll|r@{ }l *{3}{r@{.}l}}
|
||||||
\begin{figure}
|
method & init
|
||||||
\centering
|
& \multicolumn{2}{c}{loss}
|
||||||
\includegraphics{estimates_Ex01.png}
|
& \multicolumn{2}{c}{MSE}
|
||||||
\end{figure}
|
& \multicolumn{2}{c}{$\dist(\hat{\mat\alpha}, \mat\alpha)$}
|
||||||
\begin{figure}
|
& \multicolumn{2}{c}{$\dist(\hat{\mat\beta}, \mat\beta)$}
|
||||||
\centering
|
\\ \hline
|
||||||
\includegraphics{Delta_Ex01.png}
|
base & vlp & -2642&(1594) & 1&82 (2.714) & 0&248 (0.447) & 0&271 (0.458) \\
|
||||||
\end{figure}
|
new & vlp & -2704&(1452) & 1&78 (2.658) & 0&233 (0.438) & 0&260 (0.448) \\
|
||||||
\begin{figure}
|
new & ls & -3479& (95) & 0&99 (0.025) & 0&037 (0.017) & 0&035 (0.015) \\
|
||||||
\centering
|
momentum & vlp & -2704&(1452) & 1&78 (2.658) & 0&233 (0.438) & 0&260 (0.448) \\
|
||||||
\includegraphics{hist_Ex01.png}
|
momentum & ls & -3479& (95) & 0&99 (0.025) & 0&037 (0.017) & 0&035 (0.015) \\
|
||||||
\end{figure}
|
approx & vlp & 6819&(1995) & 3&99 (12.256) & 0&267 (0.448) & 0&287 (0.457) \\
|
||||||
|
approx & ls & 5457& (163) & 0&99 (0.025) & 0&033 (0.017) & 0&030 (0.012) \\
|
||||||
|
\end{tabular}
|
||||||
|
\caption{Mean (standard deviation) for simulated runs of $20$ repititions for the model $\mat{X} = \mat{\beta}\mat{f}_y\t{\mat{\alpha}}$ of dimensinos $(p, q) = (11, 7)$, $(k, r) = (3, 5)$ with a sample size of $n = 200$. The covariance structure is $\mat{\Delta} = \mat{\Delta}_2\otimes \mat{\Delta}_1$ for $\Delta_i = \text{AR}(\sqrt{0.5})$, $i = 1, 2$. The functions applied to the standard normal response $y$ are $\sin, \cos$ with increasing frequency.}
|
||||||
|
\end{table}
|
||||||
|
|
||||||
|
% \begin{figure}
|
||||||
|
% \centering
|
||||||
|
% \includegraphics{loss_Ex01.png}
|
||||||
|
% \end{figure}
|
||||||
|
% \begin{figure}
|
||||||
|
% \centering
|
||||||
|
% \includegraphics{estimates_Ex01.png}
|
||||||
|
% \end{figure}
|
||||||
|
% \begin{figure}
|
||||||
|
% \centering
|
||||||
|
% \includegraphics{Delta_Ex01.png}
|
||||||
|
% \end{figure}
|
||||||
|
% \begin{figure}
|
||||||
|
% \centering
|
||||||
|
% \includegraphics{hist_Ex01.png}
|
||||||
|
% \end{figure}
|
||||||
|
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
%%% Bib and Index %%%
|
%%% Bib and Index %%%
|
||||||
|
@ -707,41 +753,33 @@ Therefore, we use $\mat{A}, \mat{B}, \mat{X}, \mat{F}, \mat{R}, ...$ for matrice
|
||||||
\ten{T}\ttm[2]\mat{B}\ &{\widehat{=}}\ \mat{B}\ten{T}_{(2)} & \ten{T}(n, p, q), \ten{T}_{(2)}(p, n q), \mat{B}(q, p)
|
\ten{T}\ttm[2]\mat{B}\ &{\widehat{=}}\ \mat{B}\ten{T}_{(2)} & \ten{T}(n, p, q), \ten{T}_{(2)}(p, n q), \mat{B}(q, p)
|
||||||
\end{align*}
|
\end{align*}
|
||||||
|
|
||||||
\section{Matrix Valued Normal Distribution}
|
% \section{Matrix Valued Normal Distribution}
|
||||||
A random variable $\mat{X}$ of dimensions $p\times q$ is \emph{Matrix-Valued Normal Distribution}, denoted
|
% A random variable $\mat{X}$ of dimensions $p\times q$ is \emph{Matrix-Valued Normal Distribution}, denoted
|
||||||
\begin{displaymath}
|
% \begin{displaymath}
|
||||||
\mat{X}\sim\mathcal{MN}_{p\times q}(\mat{\mu}, \mat{\Delta}_2, \mat{\Delta}_1),
|
% \mat{X}\sim\mathcal{MN}_{p\times q}(\mat{\mu}, \mat{\Delta}_2, \mat{\Delta}_1),
|
||||||
\end{displaymath}
|
% \end{displaymath}
|
||||||
if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otimes\mat\Delta_2)$. Note the order of the covariance matrices $\mat\Delta_1, \mat\Delta_2$. Its density is given by
|
% if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otimes\mat\Delta_2)$. Note the order of the covariance matrices $\mat\Delta_1, \mat\Delta_2$. Its density is given by
|
||||||
\begin{displaymath}
|
% \begin{displaymath}
|
||||||
f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right).
|
% f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right).
|
||||||
\end{displaymath}
|
% \end{displaymath}
|
||||||
|
|
||||||
\section{Sampling form a Multi-Array Normal Distribution}
|
% \section{Sampling form a Multi-Array Normal Distribution}
|
||||||
Let $\ten{X}$ be an order (rank) $r$ Multi-Array random variable of dimensions $p_1\times...\times p_r$ following a Multi-Array (or Tensor) Normal distributed
|
% Let $\ten{X}$ be an order (rank) $r$ Multi-Array random variable of dimensions $p_1\times...\times p_r$ following a Multi-Array (or Tensor) Normal distributed
|
||||||
\begin{displaymath}
|
% \begin{displaymath}
|
||||||
\ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
|
% \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
|
||||||
\end{displaymath}
|
% \end{displaymath}
|
||||||
Its density is given by
|
% Its density is given by
|
||||||
\begin{displaymath}
|
% \begin{displaymath}
|
||||||
f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1}
|
% f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1}
|
||||||
\exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
|
% \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
|
||||||
\end{displaymath}
|
% \end{displaymath}
|
||||||
with $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
|
% with $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
|
||||||
\begin{displaymath}
|
% \begin{displaymath}
|
||||||
\vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1)
|
% \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1)
|
||||||
\end{displaymath}
|
% \end{displaymath}
|
||||||
with $p = \prod_{i = 1}^r p_i$.
|
% with $p = \prod_{i = 1}^r p_i$.
|
||||||
|
|
||||||
When sampling from the Multi-Array Normal one way is to sample from the Multi-Variate Normal and then reshaping the result, but this is usually very inefficient because it requires to store the multi-variate covariance matrix which is very big. Instead, it is more efficient to sample $\ten{Z}$ as a tensor of the same shape as $\ten{X}$ with standard normal entries and then transform the $\ten{Z}$ to follow the Multi-Array Normal as follows
|
% \todo{Check this!!!}
|
||||||
\begin{displaymath}
|
|
||||||
\ten{Z}\sim\mathcal{TN}(0, \mat{I}_{p_1}, ..., \mat{I}_{p_r})
|
|
||||||
\quad\Rightarrow\quad
|
|
||||||
\ten{X} = \ten{Z}\times\{\mat{\Delta}_1^{1/2}, ..., \mat{\Delta}_r^{1/2}\} + \mu\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
|
|
||||||
\end{displaymath}
|
|
||||||
where the sampling from the standard Multi-Array Normal is done by sampling all of the elements of $\ten{Z}$ from a standard Normal.
|
|
||||||
|
|
||||||
\todo{Check this!!!}
|
|
||||||
|
|
||||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
%%% Reference Summaries %%%
|
%%% Reference Summaries %%%
|
||||||
|
|
|
@ -18,6 +18,7 @@ sim <- function(X, Fy, alpha.true, beta.true, max.iter = 500L) {
|
||||||
# Logger creator
|
# Logger creator
|
||||||
logger <- function(name) {
|
logger <- function(name) {
|
||||||
eval(substitute(function(iter, loss, alpha, beta, ...) {
|
eval(substitute(function(iter, loss, alpha, beta, ...) {
|
||||||
|
tryCatch({
|
||||||
hist[iter + 1L, ] <<- c(
|
hist[iter + 1L, ] <<- c(
|
||||||
iter = iter,
|
iter = iter,
|
||||||
loss = loss,
|
loss = loss,
|
||||||
|
@ -28,7 +29,16 @@ sim <- function(X, Fy, alpha.true, beta.true, max.iter = 500L) {
|
||||||
norm.alpha = norm(alpha, "F"),
|
norm.alpha = norm(alpha, "F"),
|
||||||
norm.beta = norm(beta, "F"),
|
norm.beta = norm(beta, "F"),
|
||||||
mse = mean((X - mlm(Fy, alpha, beta, modes = 3:2))^2)
|
mse = mean((X - mlm(Fy, alpha, beta, modes = 3:2))^2)
|
||||||
)
|
)},
|
||||||
|
error = function(e) {
|
||||||
|
cat("Error in ", name,
|
||||||
|
", dim(alpha): ", dim(alpha),
|
||||||
|
", dim(alpha.true): ", dim(alpha.true),
|
||||||
|
", dim(beta)", dim(beta),
|
||||||
|
", dim(beta.true)", dim(beta.true),
|
||||||
|
"\n")
|
||||||
|
stop(e)
|
||||||
|
})
|
||||||
|
|
||||||
cat(sprintf(
|
cat(sprintf(
|
||||||
"%s(%3d) | l = %-12.4f - dist = %-.4e - alpha(%d, %d) = %-.4e - beta(%d, %d) = %-.4e\n",
|
"%s(%3d) | l = %-12.4f - dist = %-.4e - alpha(%d, %d) = %-.4e - beta(%d, %d) = %-.4e\n",
|
||||||
|
@ -40,7 +50,11 @@ sim <- function(X, Fy, alpha.true, beta.true, max.iter = 500L) {
|
||||||
}
|
}
|
||||||
|
|
||||||
# Initialize logger history targets
|
# Initialize logger history targets
|
||||||
hist.base <- hist.new <- hist.momentum <- hist.approx <- hist.ls <-
|
hist.base <-
|
||||||
|
hist.new.vlp <- hist.new.ls <-
|
||||||
|
hist.ls <-
|
||||||
|
hist.momentum.vlp <- hist.momentum.ls <-
|
||||||
|
hist.approx.vlp <- hist.approx.ls <-
|
||||||
data.frame(iter = seq(0L, max.iter),
|
data.frame(iter = seq(0L, max.iter),
|
||||||
loss = NA, dist = NA, dist.alpha = NA, dist.beta = NA,
|
loss = NA, dist = NA, dist.alpha = NA, dist.beta = NA,
|
||||||
norm.alpha = NA, norm.beta = NA, mse = NA
|
norm.alpha = NA, norm.beta = NA, mse = NA
|
||||||
|
@ -49,53 +63,81 @@ sim <- function(X, Fy, alpha.true, beta.true, max.iter = 500L) {
|
||||||
# Base (old)
|
# Base (old)
|
||||||
kpir.base(X, Fy, max.iter = max.iter, logger = logger("base"))
|
kpir.base(X, Fy, max.iter = max.iter, logger = logger("base"))
|
||||||
|
|
||||||
# New (simple Gradient Descent)
|
# New (simple Gradient Descent, using VLP initialization)
|
||||||
kpir.new(X, Fy, max.iter = max.iter, logger = logger("new"))
|
kpir.new(X, Fy, max.iter = max.iter, init.method = "vlp",
|
||||||
|
logger = logger("new.vlp"))
|
||||||
|
kpir.new(X, Fy, max.iter = max.iter, init.method = "ls",
|
||||||
|
logger = logger("new.ls"))
|
||||||
|
|
||||||
# Least Squares estimate (alternating estimation)
|
# Least Squares estimate (alternating estimation)
|
||||||
kpir.ls(X, Fy, sample.mode = 1L, max.iter = max.iter, logger = logger("ls"))
|
kpir.ls(X, Fy, sample.mode = 1L, max.iter = max.iter, logger = logger("ls"))
|
||||||
|
|
||||||
# Gradient Descent with Nesterov Momentum
|
# Gradient Descent with Nesterov Momentum
|
||||||
kpir.momentum(X, Fy, max.iter = max.iter, logger = logger("momentum"))
|
kpir.momentum(X, Fy, max.iter = max.iter, init.method = "vlp",
|
||||||
|
logger = logger("momentum.vlp"))
|
||||||
|
kpir.momentum(X, Fy, max.iter = max.iter, init.method = "ls",
|
||||||
|
logger = logger("momentum.ls"))
|
||||||
|
|
||||||
# Approximated MLE with Nesterov Momentum
|
# Approximated MLE with Nesterov Momentum
|
||||||
kpir.approx(X, Fy, max.iter = max.iter, logger = logger("approx"))
|
kpir.approx(X, Fy, max.iter = max.iter, init.method = "vlp",
|
||||||
|
logger = logger("approx.vlp"))
|
||||||
|
kpir.approx(X, Fy, max.iter = max.iter, init.method = "ls",
|
||||||
|
logger = logger("approx.ls"))
|
||||||
|
|
||||||
# Add method tags
|
# Add method tags
|
||||||
hist.base$method <- factor("base")
|
hist.base$method <- factor("base")
|
||||||
hist.new$method <- factor("new")
|
hist.new.vlp$method <- factor("new")
|
||||||
|
hist.new.ls$method <- factor("new")
|
||||||
hist.ls$method <- factor("ls")
|
hist.ls$method <- factor("ls")
|
||||||
hist.momentum$method <- factor("momentum")
|
hist.momentum.vlp$method <- factor("momentum")
|
||||||
hist.approx$method <- factor("approx")
|
hist.momentum.ls$method <- factor("momentum")
|
||||||
|
hist.approx.vlp$method <- factor("approx")
|
||||||
|
hist.approx.ls$method <- factor("approx")
|
||||||
|
# Add init. method tag
|
||||||
|
hist.base$init <- factor("vlp")
|
||||||
|
hist.new.vlp$init <- factor("vlp")
|
||||||
|
hist.new.ls$init <- factor("ls")
|
||||||
|
hist.ls$init <- factor("ls")
|
||||||
|
hist.momentum.vlp$init <- factor("vlp")
|
||||||
|
hist.momentum.ls$init <- factor("ls")
|
||||||
|
hist.approx.vlp$init <- factor("vlp")
|
||||||
|
hist.approx.ls$init <- factor("ls")
|
||||||
|
|
||||||
# Combine results and return
|
# Combine results and return
|
||||||
rbind(hist.base, hist.new, hist.momentum, hist.approx, hist.ls)
|
rbind(
|
||||||
|
hist.base,
|
||||||
|
hist.new.vlp, hist.new.ls,
|
||||||
|
hist.ls,
|
||||||
|
hist.momentum.vlp, hist.momentum.ls,
|
||||||
|
hist.approx.vlp, hist.approx.ls
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
## Plot helper function
|
## Plot helper function
|
||||||
plot.hist2 <- function(hist, response, type = "all", ...) {
|
plot.hist2 <- function(hist, response, type = "all", ...) {
|
||||||
# Extract final results from history
|
# Extract final results from history
|
||||||
sub <- na.omit(hist[c("iter", response, "method", "repetition")])
|
sub <- na.omit(hist[c("iter", response, "method", "init", "repetition")])
|
||||||
sub <- aggregate(sub, list(sub$method, sub$repetition), tail, 1)
|
sub <- aggregate(sub, list(sub$method, sub$init, sub$repetition), tail, 1)
|
||||||
|
|
||||||
# Setup ggplot
|
# Setup ggplot
|
||||||
p <- ggplot(hist, aes_(x = quote(iter),
|
p <- ggplot(hist, aes_(x = quote(iter),
|
||||||
y = as.name(response),
|
y = as.name(response),
|
||||||
color = quote(method),
|
color = quote(method),
|
||||||
group = quote(interaction(method, repetition))))
|
linetype = quote(init),
|
||||||
|
group = quote(interaction(method, repetition, init))))
|
||||||
# Add requested layers
|
# Add requested layers
|
||||||
if (type == "all") {
|
if (type == "all") {
|
||||||
p <- p + geom_line(na.rm = TRUE)
|
p <- p + geom_line(na.rm = TRUE)
|
||||||
p <- p + geom_point(data = sub)
|
p <- p + geom_point(data = sub)
|
||||||
} else if (type == "mean") {
|
} else if (type == "mean") {
|
||||||
p <- p + geom_line(alpha = 0.5, na.rm = TRUE, linetype = "dotted")
|
p <- p + geom_line(alpha = 0.4, na.rm = TRUE, linetype = "dotted")
|
||||||
p <- p + geom_point(data = sub, alpha = 0.5)
|
p <- p + geom_point(data = sub, alpha = 0.4)
|
||||||
p <- p + geom_line(aes(group = method),
|
p <- p + geom_line(aes(group = interaction(method, init)),
|
||||||
stat = "summary", fun = "mean", na.rm = TRUE)
|
stat = "summary", fun = "mean", na.rm = TRUE)
|
||||||
} else if (type == "median") {
|
} else if (type == "median") {
|
||||||
p <- p + geom_line(alpha = 0.5, na.rm = TRUE, linetype = "dotted")
|
p <- p + geom_line(alpha = 0.4, na.rm = TRUE, linetype = "dotted")
|
||||||
p <- p + geom_point(data = sub, alpha = 0.5)
|
p <- p + geom_point(data = sub, alpha = 0.4)
|
||||||
p <- p + geom_line(aes(group = method),
|
p <- p + geom_line(aes(group = interaction(method, init)),
|
||||||
stat = "summary", fun = "median", na.rm = TRUE)
|
stat = "summary", fun = "median", na.rm = TRUE)
|
||||||
}
|
}
|
||||||
# return with theme and annotations
|
# return with theme and annotations
|
||||||
|
@ -182,7 +224,7 @@ max.iter <- 2
|
||||||
Delta.1 <- sqrt(0.5)^abs(outer(seq_len(p), seq_len(p), `-`))
|
Delta.1 <- sqrt(0.5)^abs(outer(seq_len(p), seq_len(p), `-`))
|
||||||
Delta.2 <- sqrt(0.5)^abs(outer(seq_len(q), seq_len(q), `-`))
|
Delta.2 <- sqrt(0.5)^abs(outer(seq_len(q), seq_len(q), `-`))
|
||||||
for (rep in 1:reps) {
|
for (rep in 1:reps) {
|
||||||
cat(sprintf("%4d / %d simulation rep. started\n", rep, reps))
|
cat(sprintf("\n\033[1m%4d / %d simulation rep. started\033[0m\n", rep, reps))
|
||||||
|
|
||||||
alpha.1.true <- alpha.1 <- matrix(rnorm(q * r), q, r)
|
alpha.1.true <- alpha.1 <- matrix(rnorm(q * r), q, r)
|
||||||
alpha.2.true <- alpha.2 <- matrix(rnorm(p * k), p, k)
|
alpha.2.true <- alpha.2 <- matrix(rnorm(p * k), p, k)
|
||||||
|
@ -194,10 +236,10 @@ for (rep in 1:reps) {
|
||||||
head(rep(c(0, pi / 2), ceiling(0.5 * k * r)), k * r)
|
head(rep(c(0, pi / 2), ceiling(0.5 * k * r)), k * r)
|
||||||
))
|
))
|
||||||
dim(Fy) <- c(n, k, r)
|
dim(Fy) <- c(n, k, r)
|
||||||
X <- mlm(Fy, alpha, beta, modes = 3:2)
|
X <- mlm(Fy, alpha.1, alpha.2, modes = 3:2)
|
||||||
X <- X + rtensornorm(n, 0, Delta.1, Delta.2, sample.mode = 1L)
|
X <- X + rtensornorm(n, 0, Delta.1, Delta.2, sample.mode = 1L)
|
||||||
|
|
||||||
hist.sim <- sim(X, Fy, alpha.true, beta.true, max.iter = max.iter)
|
hist.sim <- sim(X, Fy, alpha.1.true, alpha.2.true, max.iter = max.iter)
|
||||||
hist.sim$repetition <- rep
|
hist.sim$repetition <- rep
|
||||||
|
|
||||||
hist <- rbind(hist, hist.sim)
|
hist <- rbind(hist, hist.sim)
|
||||||
|
@ -213,17 +255,47 @@ hist$repetition <- factor(hist$repetition)
|
||||||
|
|
||||||
for (response in c("loss", "mse", "dist", "dist.alpha", "dist.beta")) {
|
for (response in c("loss", "mse", "dist", "dist.alpha", "dist.beta")) {
|
||||||
for (fun in c("all", "mean", "median")) {
|
for (fun in c("all", "mean", "median")) {
|
||||||
print(plot.hist2(hist, response, fun, title = fun) + coord_trans(x = "log1p"))
|
title <- paste(fun, paste(c("n", "p", "q", "k", "r"), c(n, p, q, k, r), sep = "=", collapse = ", "))
|
||||||
|
print(plot.hist2(hist, response, fun, title = title) +
|
||||||
|
coord_trans(x = "log1p"))
|
||||||
dev.print(png, file = sprintf("%s_%s_%s_%s.png", sim.name, datetime, response, fun),
|
dev.print(png, file = sprintf("%s_%s_%s_%s.png", sim.name, datetime, response, fun),
|
||||||
width = 768, height = 768, res = 125)
|
width = 768, height = 768, res = 125)
|
||||||
if (response != "loss") {
|
if (response != "loss") {
|
||||||
print(plot.hist2(hist, response, fun, title = fun) + coord_trans(x = "log1p", y = "log1p"))
|
print(plot.hist2(hist, response, fun, title = title) +
|
||||||
|
coord_trans(x = "log1p", y = "log1p"))
|
||||||
dev.print(png, file = sprintf("%s_%s_%s_%s_log.png", sim.name, datetime, response, fun),
|
dev.print(png, file = sprintf("%s_%s_%s_%s_log.png", sim.name, datetime, response, fun),
|
||||||
width = 768, height = 768, res = 125)
|
width = 768, height = 768, res = 125)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
stats <- local({
|
||||||
|
# final result from history
|
||||||
|
sub <- na.omit(hist)
|
||||||
|
sub <- aggregate(sub, list(
|
||||||
|
method = sub$method, init = sub$init, repetition = sub$repetition
|
||||||
|
), tail, 1)
|
||||||
|
|
||||||
|
# aggregate statistics over repetitions
|
||||||
|
stats.mean <- aggregate(subset(sub, select = c("loss", "mse", "dist.alpha", "dist.beta")),
|
||||||
|
list(method = sub$method, init = sub$init), mean)
|
||||||
|
stats.sd <- aggregate(subset(sub, select = c("loss", "mse", "dist.alpha", "dist.beta")),
|
||||||
|
list(method = sub$method, init = sub$init), sd)
|
||||||
|
|
||||||
|
# merge mean and sd stats together
|
||||||
|
merge(stats.mean, stats.sd, by = c("method", "init"), suffixes = c(".mean", ".sd"))
|
||||||
|
})
|
||||||
|
print(stats, digits = 2)
|
||||||
|
# method init loss.mean mse.mean dist.alpha.mean dist.beta.mean loss.sd mse.sd dist.alpha.sd dist.beta.sd
|
||||||
|
# 1 approx ls 5457 0.99 0.033 0.030 163 0.025 0.017 0.012
|
||||||
|
# 2 approx vlp 6819 3.99 0.267 0.287 1995 12.256 0.448 0.457
|
||||||
|
# 3 base vlp -2642 1.82 0.248 0.271 1594 2.714 0.447 0.458
|
||||||
|
# 4 momentum ls -3479 0.99 0.037 0.035 95 0.025 0.017 0.015
|
||||||
|
# 5 momentum vlp -2704 1.78 0.233 0.260 1452 2.658 0.438 0.448
|
||||||
|
# 6 new ls -3479 0.99 0.037 0.035 95 0.025 0.017 0.015
|
||||||
|
# 7 new vlp -2704 1.78 0.233 0.260 1452 2.658 0.438 0.448
|
||||||
|
|
||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
### Sim 3 ###
|
### Sim 3 ###
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
|
@ -15,7 +15,6 @@ export(dist.projection)
|
||||||
export(dist.subspace)
|
export(dist.subspace)
|
||||||
export(kpir.approx)
|
export(kpir.approx)
|
||||||
export(kpir.base)
|
export(kpir.base)
|
||||||
export(kpir.kron)
|
|
||||||
export(kpir.ls)
|
export(kpir.ls)
|
||||||
export(kpir.momentum)
|
export(kpir.momentum)
|
||||||
export(kpir.new)
|
export(kpir.new)
|
||||||
|
|
|
@ -84,7 +84,9 @@
|
||||||
#' Reduction and Variable Selection" By Xin Chen, Changliang Zou and
|
#' Reduction and Variable Selection" By Xin Chen, Changliang Zou and
|
||||||
#' R. Dennis Cook.
|
#' R. Dennis Cook.
|
||||||
#'
|
#'
|
||||||
#' @suggest RSpectra
|
#' @note for speed reasons this functions attempts to use
|
||||||
|
#' \code{\link[RSpectra]{eigs_sym}} if \pkg{\link{RSpectra}} is installed,
|
||||||
|
#' otherwise \code{\link{eigen}} is used which might be significantly slower.
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
CISE <- function(M, N, d = 1L, method = "PFC", max.iter = 100L, Theta = NULL,
|
CISE <- function(M, N, d = 1L, method = "PFC", max.iter = 100L, Theta = NULL,
|
||||||
|
|
|
@ -9,6 +9,7 @@
|
||||||
kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
||||||
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
|
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
|
||||||
nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
|
nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
|
||||||
|
max.init.iter = 20L, init.method = c("ls", "vlp"),
|
||||||
eps = .Machine$double.eps,
|
eps = .Machine$double.eps,
|
||||||
logger = NULL
|
logger = NULL
|
||||||
) {
|
) {
|
||||||
|
@ -17,7 +18,7 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
||||||
stopifnot(nrow(X) == NROW(Fy))
|
stopifnot(nrow(X) == NROW(Fy))
|
||||||
n <- nrow(X) # Number of observations
|
n <- nrow(X) # Number of observations
|
||||||
|
|
||||||
# Get and check predictor dimensions
|
# Check predictor dimensions
|
||||||
if (length(dim(X)) == 2L) {
|
if (length(dim(X)) == 2L) {
|
||||||
stopifnot(!missing(shape))
|
stopifnot(!missing(shape))
|
||||||
stopifnot(ncol(X) == prod(shape[1:2]))
|
stopifnot(ncol(X) == prod(shape[1:2]))
|
||||||
|
@ -26,18 +27,16 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
||||||
} else if (length(dim(X)) == 3L) {
|
} else if (length(dim(X)) == 3L) {
|
||||||
p <- dim(X)[2]
|
p <- dim(X)[2]
|
||||||
q <- dim(X)[3]
|
q <- dim(X)[3]
|
||||||
dim(X) <- c(n, p * q)
|
|
||||||
} else {
|
} else {
|
||||||
stop("'X' must be a matrix or 3-tensor")
|
stop("'X' must be a matrix or 3-tensor")
|
||||||
}
|
}
|
||||||
|
|
||||||
# Get and check response dimensions
|
# Check response dimensions
|
||||||
if (!is.array(Fy)) {
|
if (!is.array(Fy)) {
|
||||||
Fy <- as.array(Fy)
|
Fy <- as.array(Fy)
|
||||||
}
|
}
|
||||||
if (length(dim(Fy)) == 1L) {
|
if (length(dim(Fy)) == 1L) {
|
||||||
k <- r <- 1L
|
k <- r <- 1L
|
||||||
dim(Fy) <- c(n, 1L)
|
|
||||||
} else if (length(dim(Fy)) == 2L) {
|
} else if (length(dim(Fy)) == 2L) {
|
||||||
stopifnot(!missing(shape))
|
stopifnot(!missing(shape))
|
||||||
stopifnot(ncol(Fy) == prod(shape[3:4]))
|
stopifnot(ncol(Fy) == prod(shape[3:4]))
|
||||||
|
@ -46,17 +45,26 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
||||||
} else if (length(dim(Fy)) == 3L) {
|
} else if (length(dim(Fy)) == 3L) {
|
||||||
k <- dim(Fy)[2]
|
k <- dim(Fy)[2]
|
||||||
r <- dim(Fy)[3]
|
r <- dim(Fy)[3]
|
||||||
dim(Fy) <- c(n, k * r)
|
|
||||||
} else {
|
} else {
|
||||||
stop("'Fy' must be a vector, matrix or 3-tensor")
|
stop("'Fy' must be a vector, matrix or 3-tensor")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
|
### Step 1: (Approx) Least Squares initial estimate
|
||||||
|
init.method <- match.arg(init.method)
|
||||||
|
if (init.method == "ls") {
|
||||||
|
# De-Vectroize (from now on tensor arithmetics)
|
||||||
|
dim(Fy) <- c(n, k, r)
|
||||||
|
dim(X) <- c(n, p, q)
|
||||||
|
|
||||||
|
ls <- kpir.ls(X, Fy, max.iter = max.init.iter, sample.mode = 1L, eps = eps)
|
||||||
|
c(beta0, alpha0) %<-% ls$alphas
|
||||||
|
} else { # Van Loan and Pitsianis
|
||||||
# Vectorize
|
# Vectorize
|
||||||
dim(Fy) <- c(n, k * r)
|
dim(Fy) <- c(n, k * r)
|
||||||
dim(X) <- c(n, p * q)
|
dim(X) <- c(n, p * q)
|
||||||
# Solve
|
|
||||||
|
# solution for `X = Fy B' + epsilon`
|
||||||
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
|
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
|
||||||
if (n <= k * r || qr(cpFy)$rank < k * r) {
|
if (n <= k * r || qr(cpFy)$rank < k * r) {
|
||||||
# In case of under-determined system replace the inverse in the normal
|
# In case of under-determined system replace the inverse in the normal
|
||||||
|
@ -73,6 +81,7 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
||||||
|
|
||||||
# Decompose `B = alpha x beta` into `alpha` and `beta`
|
# Decompose `B = alpha x beta` into `alpha` and `beta`
|
||||||
c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
|
c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
|
||||||
|
}
|
||||||
|
|
||||||
# Compute residuals
|
# Compute residuals
|
||||||
R <- X - (Fy %x_3% alpha0 %x_2% beta0)
|
R <- X - (Fy %x_3% alpha0 %x_2% beta0)
|
||||||
|
|
|
@ -5,6 +5,7 @@
|
||||||
kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
||||||
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
|
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
|
||||||
nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
|
nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
|
||||||
|
max.init.iter = 20L, init.method = c("ls", "vlp"),
|
||||||
eps = .Machine$double.eps,
|
eps = .Machine$double.eps,
|
||||||
logger = NULL
|
logger = NULL
|
||||||
) {
|
) {
|
||||||
|
@ -48,7 +49,17 @@ kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
|
### Step 1: (Approx) Least Squares initial estimate
|
||||||
|
init.method <- match.arg(init.method)
|
||||||
|
if (init.method == "ls") {
|
||||||
|
dim(X) <- c(n, p, q)
|
||||||
|
dim(Fy) <- c(n, k, r)
|
||||||
|
ls <- kpir.ls(X, Fy, max.iter = max.init.iter, sample.mode = 1L, eps = eps)
|
||||||
|
c(beta0, alpha0) %<-% ls$alphas
|
||||||
|
dim(X) <- c(n, p * q)
|
||||||
|
dim(Fy) <- c(n, k * r)
|
||||||
|
} else { # Van Loan and Pitsianis
|
||||||
|
# solution for `X = Fy B' + epsilon`
|
||||||
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
|
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
|
||||||
if (n <= k * r || qr(cpFy)$rank < k * r) {
|
if (n <= k * r || qr(cpFy)$rank < k * r) {
|
||||||
# In case of under-determined system replace the inverse in the normal
|
# In case of under-determined system replace the inverse in the normal
|
||||||
|
@ -61,6 +72,7 @@ kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
||||||
|
|
||||||
# Decompose `B = alpha x beta` into `alpha` and `beta`
|
# Decompose `B = alpha x beta` into `alpha` and `beta`
|
||||||
c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
|
c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
|
||||||
|
}
|
||||||
|
|
||||||
# Compute residuals
|
# Compute residuals
|
||||||
resid <- X - tcrossprod(Fy, kronecker(alpha0, beta0))
|
resid <- X - tcrossprod(Fy, kronecker(alpha0, beta0))
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
#' @export
|
#' @export
|
||||||
kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
||||||
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
|
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
|
||||||
|
max.init.iter = 20L, init.method = c("ls", "vlp"),
|
||||||
eps = .Machine$double.eps,
|
eps = .Machine$double.eps,
|
||||||
logger = NULL
|
logger = NULL
|
||||||
) {
|
) {
|
||||||
|
@ -45,7 +46,17 @@ kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
||||||
stop("'Fy' must be a vector, matrix or 3-tensor")
|
stop("'Fy' must be a vector, matrix or 3-tensor")
|
||||||
}
|
}
|
||||||
|
|
||||||
### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
|
### Step 1: (Approx) Least Squares initial estimate
|
||||||
|
init.method <- match.arg(init.method)
|
||||||
|
if (init.method == "ls") {
|
||||||
|
dim(X) <- c(n, p, q)
|
||||||
|
dim(Fy) <- c(n, k, r)
|
||||||
|
ls <- kpir.ls(X, Fy, max.iter = max.init.iter, sample.mode = 1L, eps = eps)
|
||||||
|
c(beta, alpha) %<-% ls$alphas
|
||||||
|
dim(X) <- c(n, p * q)
|
||||||
|
dim(Fy) <- c(n, k * r)
|
||||||
|
} else { # Van Loan and Pitsianis
|
||||||
|
# solution for `X = Fy B' + epsilon`
|
||||||
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
|
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
|
||||||
if (n <= k * r || qr(cpFy)$rank < k * r) {
|
if (n <= k * r || qr(cpFy)$rank < k * r) {
|
||||||
# In case of under-determined system replace the inverse in the normal
|
# In case of under-determined system replace the inverse in the normal
|
||||||
|
@ -58,6 +69,7 @@ kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
||||||
|
|
||||||
# Decompose `B = alpha x beta` into `alpha` and `beta`
|
# Decompose `B = alpha x beta` into `alpha` and `beta`
|
||||||
c(alpha, beta) %<-% approx.kronecker(B, c(q, r), c(p, k))
|
c(alpha, beta) %<-% approx.kronecker(B, c(q, r), c(p, k))
|
||||||
|
}
|
||||||
|
|
||||||
# Compute residuals
|
# Compute residuals
|
||||||
resid <- X - tcrossprod(Fy, kronecker(alpha, beta))
|
resid <- X - tcrossprod(Fy, kronecker(alpha, beta))
|
||||||
|
|
Loading…
Reference in New Issue