diff --git a/.gitignore b/.gitignore index 5768bbd..f5e8f7e 100644 --- a/.gitignore +++ b/.gitignore @@ -81,8 +81,12 @@ vignettes/*.pdf *.RData *.Rdata +# R Profiling +*.Rprof + # VSCode configuration .vscode/ +.lintr ## Archives, compressed files/folders # Output files from R CMD build @@ -104,3 +108,8 @@ wip/ mlda_analysis/ References/ + +# Images (except images used in LaTeX) +*.png +*.svg +!**/LaTeX/*.png diff --git a/LaTeX/main.tex b/LaTeX/main.tex index d0bda6c..fa24d08 100644 --- a/LaTeX/main.tex +++ b/LaTeX/main.tex @@ -49,6 +49,7 @@ \newcommand{\mat}[1]{\boldsymbol{#1}} \newcommand{\ten}[1]{\mathcal{#1}} \renewcommand{\vec}{\operatorname{vec}} +\newcommand{\dist}{\operatorname{dist}} \DeclareMathOperator{\kron}{\otimes} % Kronecker Product \DeclareMathOperator{\hada}{\odot} % Hadamard Product \newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix) @@ -62,8 +63,8 @@ \DeclareMathOperator*{\argmax}{{arg\,max}} \newcommand{\D}{\textnormal{D}} \renewcommand{\d}{\textnormal{d}} -\renewcommand{\t}[1]{{{#1}'}} -\newcommand{\pinv}[1]{{{#1}^{\dagger}}} % `Moore-Penrose pseudoinverse` +\renewcommand{\t}[1]{{#1^{\prime}}} +\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse` \newcommand{\todo}[1]{{\color{red}TODO: #1}} % \DeclareFontFamily{U}{mathx}{\hyphenchar\font45} @@ -117,17 +118,15 @@ Let $\ten{X}$ be a multi-dimensional array random variable of order (rank) $r$ w \end{displaymath} Its density is given by \begin{displaymath} - f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1} + f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{p_{\lnot i}}} \Big)^{-1} \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right) \end{displaymath} -where $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution +where $p_{\lnot i} = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution \begin{displaymath} \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1) \end{displaymath} with $p = \prod_{i = 1}^r p_i$. - - \begin{theorem}[Tensor Normal to Multi-Variate Normal equivalence] For a multi-dimensional random variable $\ten{X}$ of order $r$ with dimensions $p_1\times ..., p_r$. Let $\ten{\mu}$ be the mean of the same order and dimensions as $\ten{X}$ and the mode covariance matrices $\mat{\Delta}_i$ of dimensions $p_i\times p_i$ for $i = 1, ..., n$. Then the tensor normal distribution is equivalent to the multi-variate normal distribution by the relation \begin{displaymath} @@ -148,12 +147,12 @@ with $p = \prod_{i = 1}^r p_i$. Next, using a property of the determinant of a Kronecker product $|\mat{\Delta}_1\otimes\mat{\Delta}_2| = |\mat{\Delta}_1|^{p_2}|\mat{\Delta}_2|^{p_1}$ yields \begin{displaymath} |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| - = |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_2|^{p_1}|\mat{\Delta}_1|^{q_1} + = |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_2|^{p_1}|\mat{\Delta}_1|^{p_{\lnot 1}} \end{displaymath} - where $q_i = \prod_{j \neq i}p_j$. By induction over $r$ the relation + where $p_{\lnot i} = \prod_{j \neq i}p_j$. 
By induction over $r$ the relation \begin{displaymath} |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| - = \prod_{i = 1}^r |\mat{\Delta}_i|^{q_i} + = \prod_{i = 1}^r |\mat{\Delta}_i|^{p_{\lnot i}} \end{displaymath} holds for arbitrary order $r$. Substituting into the Tensor Normal density leads to \begin{align*} @@ -163,6 +162,14 @@ with $p = \prod_{i = 1}^r p_i$. which is the Multi-Variate Normal density of the $p$ dimensional vector $\vec\ten{X}$. \end{proof} +One way to sample from the Multi-Array Normal is to sample from the equivalent Multi-Variate Normal and reshape the result, but this is usually very inefficient because it requires storing the full multi-variate covariance matrix, which can be very large. Instead, it is more efficient to sample a tensor $\ten{Z}$ of the same shape as $\ten{X}$ with standard normal entries and then transform $\ten{Z}$ to follow the Multi-Array Normal as follows +\begin{displaymath} + \ten{Z}\sim\mathcal{TN}(0, \mat{I}_{p_1}, ..., \mat{I}_{p_r}) + \quad\Rightarrow\quad + \ten{X} = \ten{Z}\times\{\mat{\Delta}_1^{1/2}, ..., \mat{\Delta}_r^{1/2}\} + \mu\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r). +\end{displaymath} +where sampling from the standard Multi-Array Normal simply means drawing every element of $\ten{Z}$ independently from a standard Normal. +
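For reference, this transformation amounts to only a few lines of R in the matrix case ($r = 2$). The sketch below is illustrative only (the function name and argument order are made up here); the package's `rtensornorm`, used in simulations/kpir_sim.R, is the actual implementation.

# Illustrative sketch of the sampling scheme above for the matrix case
# (r = 2): fill Z with standard normal entries, then apply the matrix
# square roots of the mode covariances.
rmatnorm.sketch <- function(n, mu, Delta.1, Delta.2) {
    p <- nrow(Delta.1)
    q <- nrow(Delta.2)
    # symmetric square root via eigendecomposition (Delta.1, Delta.2 are SPD)
    msqrt <- function(S) with(eigen(S, symmetric = TRUE),
        vectors %*% (sqrt(pmax(values, 0)) * t(vectors)))
    D.1 <- msqrt(Delta.1)
    D.2 <- msqrt(Delta.2)
    X <- array(rnorm(n * p * q), dim = c(n, p, q))  # Z ~ TN(0, I_p, I_q)
    for (i in seq_len(n)) {
        # Z_i -> Delta.1^{1/2} Z_i (Delta.2^{1/2})' + mu, so that
        # vec(X_i) ~ N(vec(mu), Delta.2 kron Delta.1)
        X[i, , ] <- mu + D.1 %*% X[i, , ] %*% t(D.2)
    }
    X
}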
\section{Introduction} We assume the model @@ -385,7 +392,7 @@ The unbiasednes comes directly from the following short computation; = \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p (\mat{\Delta}_{2})_{l,l}(\mat{\Delta}_{1})_{j,k} = (\mat\Delta_1\tr(\mat\Delta_2))_{j,k}. \end{displaymath} -which means that $\E\widetilde{\mat\Delta}_1 = \mat\Delta_1\tr(\mat\Delta_2)$ and in analogy $\E\widetilde{\mat\Delta}_2 = \mat\Delta_2\tr(\mat\Delta_1)$. Now, we need to handle the scaling which can be estimated unbiased by +which means that $\E\widetilde{\mat\Delta}_1 = \mat\Delta_1\tr(\mat\Delta_2)$ and, analogously, $\E\widetilde{\mat\Delta}_2 = \mat\Delta_2\tr(\mat\Delta_1)$. Now, we need to handle the scaling, which can be estimated unbiasedly by \begin{displaymath} \tilde{s} = \frac{1}{n}\sum_{i = 1}^n \|\mat{R}_i\|_F^2 \end{displaymath} @@ -481,7 +488,7 @@ Putting it all together &\hspace{3em}\hspace{3em} \hspace{4.7em} - \tilde{s}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1} \t{\mat{R}_i} \widetilde{\mat{\Delta}}_2^{-1}) \Big)\d\mat{\beta}\bigg). \end{align*} -Observe that the bracketed expressions before $\d\mat{\alpha}$ and $\d\mat{\beta}$ are transposed of each other. Lets denote the expression for $\d\mat{\alpha}$ as $\mat{G}_i$ which has the form +Observe that the bracketed expressions before $\d\mat{\alpha}$ and $\d\mat{\beta}$ are transposes of each other. Let's denote the expression for $\d\mat{\alpha}$ by $\mat{G}_i$, which has the form \begin{displaymath} \mat{G}_i = (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\mat{R}_i @@ -521,7 +528,7 @@ Let $\ten{X}, \ten{F}$ be order (rank) $r$ tensors of dimensions $p_1\times ... \begin{displaymath} \ten{X}_i = \ten{\mu} + \ten{F}_i\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon}_i \end{displaymath} -where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distributed $\ten{\epsilon}\sim\mathcal{TM}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for $\mat{\Delta}_j\in\mathcal{S}^{p_j}_{++}$, $j = 1, ..., r$. Given $i = 1, ..., n$ observations the collected model containing all observations +where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distributed $\ten{\epsilon}\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for $\mat{\Delta}_j\in\mathcal{S}^{p_j}_{++}$, $j = 1, ..., r$. Given observations $i = 1, ..., n$, the model collecting all observations is \begin{displaymath} \ten{X} = \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon} \end{displaymath} @@ -529,48 +536,87 @@ which is almost identical as the observations $\ten{X}_i, \ten{F}_i$ are stacked In the following we assume w.l.o.g that $\ten{\mu} = 0$, as if this is not true we simply replace $\ten{X}_i$ with $\ten{X}_i - \ten{\mu}$ for $i = 1, ..., n$ before collecting all the observations in the response tensor $\ten{X}$. -The goal here is to find reasonable estimates for $\mat{\alpha}_i$, $i = 1, ..., n$ for the mean model +The goal here is to find reasonable estimates for $\mat{\alpha}_j$, $j = 1, ..., r$, for the mean model \begin{displaymath} \E \ten{X}|\ten{F}, \mat{\alpha}_1, ..., \mat{\alpha}_r = \ten{F}\times\{\mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n\} - = \ten{F}\times_{i\in[r]}\mat{\alpha}_i. + = \ten{F}\times_{j\in[r]}\mat{\alpha}_j. \end{displaymath} Under the mean model we have using the general mode product relation $(\ten{A}\times_j\mat{B})_{(j)} = \mat{B}\ten{A}_{(j)}$ we get \begin{align*} \ten{X}_{(j)}\t{\ten{X}_{(j)}} \overset{\text{SVD}}{=} \mat{U}_j\mat{D}_j\t{\mat{U}_j} - = \mat{\alpha}_j(\ten{F}\times_{i\in[r]\backslash j}\mat{\alpha}_i)_{(j)} - \t{(\ten{F}\times_{i\in[r]\backslash j}\mat{\alpha}_i)_{(j)}}\t{\mat{\alpha}_j} + = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)} + \t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}\t{\mat{\alpha}_j} \end{align*} for the $j = 1, ..., r$ modes. Using this relation we construct an iterative estimation process by setting the initial estimates of $\hat{\mat{\alpha}}_j^{(0)} = \mat{U}_j[, 1:q_j]$ which are the first $q_j$ columns of $\mat{U}_j$. -\todo{continue} +To obtain least squares estimates for $\mat{\alpha}_j$, $j = 1, ..., r$, we observe that matricization of the mean model, +\begin{displaymath} + \ten{X}_{(j)} = (\ten{F}\times_{k\in[r]}\mat{\alpha}_k)_{(j)} = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)} +\end{displaymath} +leads to the normal equations for each $\mat{\alpha}_j$, $j = 1, ..., r$, +\begin{displaymath} + \ten{X}_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} +\end{displaymath} +where the normal equations for $\mat{\alpha}_j$ depend on all the other $\mat{\alpha}_k$, $k \neq j$. Together with the initial estimates from above this suggests an alternating estimation approach.
Indexing the current iteration by $t = 1, ...$, a new estimate $\widehat{\mat{\alpha}}_j^{(t)}$, given the previous estimates $\widehat{\mat{\alpha}}_k^{(t-1)}$, $k = 1, ..., r$, is computed as +\begin{displaymath} + \widehat{\mat{\alpha}}_j^{(t)} = + \ten{X}_{(j)} + \t{\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}} + \left( + \big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)} + \t{\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}} + \right)^{-1} +\end{displaymath} +for $j = 1, ..., r$ until convergence or a maximum number of iterations is exceeded. The estimates at convergence are the least squares estimates produced by this procedure. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Numerical Examples %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Numerical Examples} -The first example (which by it self is \emph{not} exemplary) is the estimation with parameters $n = 200$, $p = 11$, $q = 5$, $k = 14$ and $r = 9$. The ``true'' matrices $\mat\alpha$, $\mat\beta$ generated by sampling there elements i.i.d. standard normal like the responses $y$. Then, for each observation, $\mat{f}_y$ is computed as $\sin(s_{i, j} y + o_{i j})$ \todo{ properly describe} to fill the elements of $\mat{f}_y$. Then the $\mat{X}$'s are samples as -\begin{displaymath} - \mat{X} = \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}, \qquad \vec{\mat{\epsilon}} \sim \mathbb{N}_{p q}(\mat{0}, \mat{\Delta}) -\end{displaymath} -where $\mat{\Delta}_{i j} = 0.5^{|i - j|}$ for $i, j = 1, ..., p q$. +% The first example (which by it self is \emph{not} exemplary) is the estimation with parameters $n = 200$, $p = 11$, $q = 5$, $k = 14$ and $r = 9$. The ``true'' matrices $\mat\alpha$, $\mat\beta$ generated by sampling there elements i.i.d. standard normal like the responses $y$. Then, for each observation, $\mat{f}_y$ is computed as $\sin(s_{i, j} y + o_{i j})$ \todo{ properly describe} to fill the elements of $\mat{f}_y$. Then the $\mat{X}$'s are samples as +% \begin{displaymath} +% \mat{X} = \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}, \qquad \vec{\mat{\epsilon}} \sim \mathbb{N}_{p q}(\mat{0}, \mat{\Delta}) +% \end{displaymath} +% where $\mat{\Delta}_{i j} = 0.5^{|i - j|}$ for $i, j = 1, ..., p q$. 
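The alternating least squares procedure described in the previous section (the `ls` initialization in the table below) can be sketched in base R for the matrix case with two modes, i.e. the model $\ten{X}_i = \mat{\beta}\mat{F}_i\t{\mat{\alpha}} + \ten{\epsilon}_i$ with $\mat{\beta}$ of size $p\times k$ and $\mat{\alpha}$ of size $q\times r$. The package's `kpir.ls` is the actual implementation; the helper below, its name, and the plain eigen-based initialization are only illustrative.

# Illustrative base-R sketch of the alternating least squares updates for
# the two-mode model X_i = beta F_i t(alpha) + eps_i, beta (p x k), alpha (q x r).
kpir.ls.sketch <- function(X, Fy, k, r, max.iter = 20L, eps = 1e-8) {
    n <- dim(X)[1]; p <- dim(X)[2]; q <- dim(X)[3]
    # initial estimates: leading eigenvectors of the mode-wise products
    # X_(j) t(X_(j)), i.e. the first columns of U_j as described above
    X.2 <- matrix(aperm(X, c(2, 1, 3)), p)  # mode-2 unfolding, p x (n q)
    X.3 <- matrix(aperm(X, c(3, 1, 2)), q)  # mode-3 unfolding, q x (n p)
    beta  <- eigen(tcrossprod(X.2), symmetric = TRUE)$vectors[, 1:k, drop = FALSE]
    alpha <- eigen(tcrossprod(X.3), symmetric = TRUE)$vectors[, 1:r, drop = FALSE]
    for (iter in seq_len(max.iter)) {
        # accumulate the normal equations for beta (fixed alpha) and for
        # alpha (fixed beta), both using the estimates of the previous sweep
        cp.b <- matrix(0, p, k); gram.b <- matrix(0, k, k)
        cp.a <- matrix(0, q, r); gram.a <- matrix(0, r, r)
        for (i in seq_len(n)) {
            Xi <- matrix(X[i, , ], p, q)
            Fi <- matrix(Fy[i, , ], k, r)
            Fa <- Fi %*% t(alpha)   # k x q, regressor for the beta update
            bF <- beta %*% Fi       # p x r, regressor for the alpha update
            cp.b <- cp.b + Xi %*% t(Fa);  gram.b <- gram.b + tcrossprod(Fa)
            cp.a <- cp.a + t(Xi) %*% bF;  gram.a <- gram.a + crossprod(bF)
        }
        beta.new  <- cp.b %*% solve(gram.b)
        alpha.new <- cp.a %*% solve(gram.a)
        delta <- max(norm(beta.new - beta, "F"), norm(alpha.new - alpha, "F"))
        beta <- beta.new; alpha <- alpha.new
        if (delta < eps) break
    }
    list(alpha = alpha, beta = beta, iter = iter)
}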
-\begin{figure} +\begin{table}[!ht] \centering - \includegraphics{loss_Ex01.png} -\end{figure} -\begin{figure} - \centering - \includegraphics{estimates_Ex01.png} -\end{figure} -\begin{figure} - \centering - \includegraphics{Delta_Ex01.png} -\end{figure} -\begin{figure} - \centering - \includegraphics{hist_Ex01.png} -\end{figure} + % see: https://en.wikibooks.org/wiki/LaTeX/Tables + \begin{tabular}{ll|r@{ }l *{3}{r@{.}l}} + method & init + & \multicolumn{2}{c}{loss} + & \multicolumn{2}{c}{MSE} + & \multicolumn{2}{c}{$\dist(\hat{\mat\alpha}, \mat\alpha)$} + & \multicolumn{2}{c}{$\dist(\hat{\mat\beta}, \mat\beta)$} + \\ \hline + base & vlp & -2642&(1594) & 1&82 (2.714) & 0&248 (0.447) & 0&271 (0.458) \\ + new & vlp & -2704&(1452) & 1&78 (2.658) & 0&233 (0.438) & 0&260 (0.448) \\ + new & ls & -3479& (95) & 0&99 (0.025) & 0&037 (0.017) & 0&035 (0.015) \\ + momentum & vlp & -2704&(1452) & 1&78 (2.658) & 0&233 (0.438) & 0&260 (0.448) \\ + momentum & ls & -3479& (95) & 0&99 (0.025) & 0&037 (0.017) & 0&035 (0.015) \\ + approx & vlp & 6819&(1995) & 3&99 (12.256) & 0&267 (0.448) & 0&287 (0.457) \\ + approx & ls & 5457& (163) & 0&99 (0.025) & 0&033 (0.017) & 0&030 (0.012) \\ + \end{tabular} + \caption{Mean (standard deviation) for simulated runs of $20$ repititions for the model $\mat{X} = \mat{\beta}\mat{f}_y\t{\mat{\alpha}}$ of dimensinos $(p, q) = (11, 7)$, $(k, r) = (3, 5)$ with a sample size of $n = 200$. The covariance structure is $\mat{\Delta} = \mat{\Delta}_2\otimes \mat{\Delta}_1$ for $\Delta_i = \text{AR}(\sqrt{0.5})$, $i = 1, 2$. The functions applied to the standard normal response $y$ are $\sin, \cos$ with increasing frequency.} +\end{table} + +% \begin{figure} +% \centering +% \includegraphics{loss_Ex01.png} +% \end{figure} +% \begin{figure} +% \centering +% \includegraphics{estimates_Ex01.png} +% \end{figure} +% \begin{figure} +% \centering +% \includegraphics{Delta_Ex01.png} +% \end{figure} +% \begin{figure} +% \centering +% \includegraphics{hist_Ex01.png} +% \end{figure} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Bib and Index %%% @@ -707,41 +753,33 @@ Therefore, we use $\mat{A}, \mat{B}, \mat{X}, \mat{F}, \mat{R}, ...$ for matrice \ten{T}\ttm[2]\mat{B}\ &{\widehat{=}}\ \mat{B}\ten{T}_{(2)} & \ten{T}(n, p, q), \ten{T}_{(2)}(p, n q), \mat{B}(q, p) \end{align*} -\section{Matrix Valued Normal Distribution} -A random variable $\mat{X}$ of dimensions $p\times q$ is \emph{Matrix-Valued Normal Distribution}, denoted -\begin{displaymath} - \mat{X}\sim\mathcal{MN}_{p\times q}(\mat{\mu}, \mat{\Delta}_2, \mat{\Delta}_1), -\end{displaymath} -if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otimes\mat\Delta_2)$. Note the order of the covariance matrices $\mat\Delta_1, \mat\Delta_2$. Its density is given by -\begin{displaymath} - f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right). -\end{displaymath} +% \section{Matrix Valued Normal Distribution} +% A random variable $\mat{X}$ of dimensions $p\times q$ is \emph{Matrix-Valued Normal Distribution}, denoted +% \begin{displaymath} +% \mat{X}\sim\mathcal{MN}_{p\times q}(\mat{\mu}, \mat{\Delta}_2, \mat{\Delta}_1), +% \end{displaymath} +% if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otimes\mat\Delta_2)$. Note the order of the covariance matrices $\mat\Delta_1, \mat\Delta_2$. 
Its density is given by +% \begin{displaymath} +% f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right). +% \end{displaymath} -\section{Sampling form a Multi-Array Normal Distribution} -Let $\ten{X}$ be an order (rank) $r$ Multi-Array random variable of dimensions $p_1\times...\times p_r$ following a Multi-Array (or Tensor) Normal distributed -\begin{displaymath} - \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r). -\end{displaymath} -Its density is given by -\begin{displaymath} - f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1} - \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right) -\end{displaymath} -with $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution -\begin{displaymath} - \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1) -\end{displaymath} -with $p = \prod_{i = 1}^r p_i$. +% \section{Sampling form a Multi-Array Normal Distribution} +% Let $\ten{X}$ be an order (rank) $r$ Multi-Array random variable of dimensions $p_1\times...\times p_r$ following a Multi-Array (or Tensor) Normal distributed +% \begin{displaymath} +% \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r). +% \end{displaymath} +% Its density is given by +% \begin{displaymath} +% f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1} +% \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right) +% \end{displaymath} +% with $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution +% \begin{displaymath} +% \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1) +% \end{displaymath} +% with $p = \prod_{i = 1}^r p_i$. -When sampling from the Multi-Array Normal one way is to sample from the Multi-Variate Normal and then reshaping the result, but this is usually very inefficient because it requires to store the multi-variate covariance matrix which is very big. Instead, it is more efficient to sample $\ten{Z}$ as a tensor of the same shape as $\ten{X}$ with standard normal entries and then transform the $\ten{Z}$ to follow the Multi-Array Normal as follows -\begin{displaymath} - \ten{Z}\sim\mathcal{TN}(0, \mat{I}_{p_1}, ..., \mat{I}_{p_r}) - \quad\Rightarrow\quad - \ten{X} = \ten{Z}\times\{\mat{\Delta}_1^{1/2}, ..., \mat{\Delta}_r^{1/2}\} + \mu\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r). -\end{displaymath} -where the sampling from the standard Multi-Array Normal is done by sampling all of the elements of $\ten{Z}$ from a standard Normal. - -\todo{Check this!!!} +% \todo{Check this!!!} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Reference Summaries %%% diff --git a/simulations/kpir_sim.R b/simulations/kpir_sim.R index 4eca504..b4ab6d3 100644 --- a/simulations/kpir_sim.R +++ b/simulations/kpir_sim.R @@ -18,17 +18,27 @@ sim <- function(X, Fy, alpha.true, beta.true, max.iter = 500L) { # Logger creator logger <- function(name) { eval(substitute(function(iter, loss, alpha, beta, ...) 
{ - hist[iter + 1L, ] <<- c( - iter = iter, - loss = loss, - dist = (dist <- dist.subspace(c(kronecker(alpha.true, beta.true)), - c(kronecker(alpha, beta)))), - dist.alpha = (dist.alpha <- dist.subspace(c(alpha.true), c(alpha))), - dist.beta = (dist.beta <- dist.subspace(c( beta.true), c(beta ))), - norm.alpha = norm(alpha, "F"), - norm.beta = norm(beta, "F"), - mse = mean((X - mlm(Fy, alpha, beta, modes = 3:2))^2) - ) + tryCatch({ + hist[iter + 1L, ] <<- c( + iter = iter, + loss = loss, + dist = (dist <- dist.subspace(c(kronecker(alpha.true, beta.true)), + c(kronecker(alpha, beta)))), + dist.alpha = (dist.alpha <- dist.subspace(c(alpha.true), c(alpha))), + dist.beta = (dist.beta <- dist.subspace(c( beta.true), c(beta ))), + norm.alpha = norm(alpha, "F"), + norm.beta = norm(beta, "F"), + mse = mean((X - mlm(Fy, alpha, beta, modes = 3:2))^2) + )}, + error = function(e) { + cat("Error in ", name, + ", dim(alpha): ", dim(alpha), + ", dim(alpha.true): ", dim(alpha.true), + ", dim(beta)", dim(beta), + ", dim(beta.true)", dim(beta.true), + "\n") + stop(e) + }) cat(sprintf( "%s(%3d) | l = %-12.4f - dist = %-.4e - alpha(%d, %d) = %-.4e - beta(%d, %d) = %-.4e\n", @@ -40,7 +50,11 @@ sim <- function(X, Fy, alpha.true, beta.true, max.iter = 500L) { } # Initialize logger history targets - hist.base <- hist.new <- hist.momentum <- hist.approx <- hist.ls <- + hist.base <- + hist.new.vlp <- hist.new.ls <- + hist.ls <- + hist.momentum.vlp <- hist.momentum.ls <- + hist.approx.vlp <- hist.approx.ls <- data.frame(iter = seq(0L, max.iter), loss = NA, dist = NA, dist.alpha = NA, dist.beta = NA, norm.alpha = NA, norm.beta = NA, mse = NA @@ -49,53 +63,81 @@ sim <- function(X, Fy, alpha.true, beta.true, max.iter = 500L) { # Base (old) kpir.base(X, Fy, max.iter = max.iter, logger = logger("base")) - # New (simple Gradient Descent) - kpir.new(X, Fy, max.iter = max.iter, logger = logger("new")) + # New (simple Gradient Descent, using VLP initialization) + kpir.new(X, Fy, max.iter = max.iter, init.method = "vlp", + logger = logger("new.vlp")) + kpir.new(X, Fy, max.iter = max.iter, init.method = "ls", + logger = logger("new.ls")) # Least Squares estimate (alternating estimation) kpir.ls(X, Fy, sample.mode = 1L, max.iter = max.iter, logger = logger("ls")) # Gradient Descent with Nesterov Momentum - kpir.momentum(X, Fy, max.iter = max.iter, logger = logger("momentum")) + kpir.momentum(X, Fy, max.iter = max.iter, init.method = "vlp", + logger = logger("momentum.vlp")) + kpir.momentum(X, Fy, max.iter = max.iter, init.method = "ls", + logger = logger("momentum.ls")) # Approximated MLE with Nesterov Momentum - kpir.approx(X, Fy, max.iter = max.iter, logger = logger("approx")) + kpir.approx(X, Fy, max.iter = max.iter, init.method = "vlp", + logger = logger("approx.vlp")) + kpir.approx(X, Fy, max.iter = max.iter, init.method = "ls", + logger = logger("approx.ls")) # Add method tags - hist.base$method <- factor("base") - hist.new$method <- factor("new") - hist.ls$method <- factor("ls") - hist.momentum$method <- factor("momentum") - hist.approx$method <- factor("approx") + hist.base$method <- factor("base") + hist.new.vlp$method <- factor("new") + hist.new.ls$method <- factor("new") + hist.ls$method <- factor("ls") + hist.momentum.vlp$method <- factor("momentum") + hist.momentum.ls$method <- factor("momentum") + hist.approx.vlp$method <- factor("approx") + hist.approx.ls$method <- factor("approx") + # Add init. 
method tag + hist.base$init <- factor("vlp") + hist.new.vlp$init <- factor("vlp") + hist.new.ls$init <- factor("ls") + hist.ls$init <- factor("ls") + hist.momentum.vlp$init <- factor("vlp") + hist.momentum.ls$init <- factor("ls") + hist.approx.vlp$init <- factor("vlp") + hist.approx.ls$init <- factor("ls") # Combine results and return - rbind(hist.base, hist.new, hist.momentum, hist.approx, hist.ls) + rbind( + hist.base, + hist.new.vlp, hist.new.ls, + hist.ls, + hist.momentum.vlp, hist.momentum.ls, + hist.approx.vlp, hist.approx.ls + ) } ## Plot helper function plot.hist2 <- function(hist, response, type = "all", ...) { # Extract final results from history - sub <- na.omit(hist[c("iter", response, "method", "repetition")]) - sub <- aggregate(sub, list(sub$method, sub$repetition), tail, 1) + sub <- na.omit(hist[c("iter", response, "method", "init", "repetition")]) + sub <- aggregate(sub, list(sub$method, sub$init, sub$repetition), tail, 1) # Setup ggplot p <- ggplot(hist, aes_(x = quote(iter), y = as.name(response), color = quote(method), - group = quote(interaction(method, repetition)))) + linetype = quote(init), + group = quote(interaction(method, repetition, init)))) # Add requested layers if (type == "all") { p <- p + geom_line(na.rm = TRUE) p <- p + geom_point(data = sub) } else if (type == "mean") { - p <- p + geom_line(alpha = 0.5, na.rm = TRUE, linetype = "dotted") - p <- p + geom_point(data = sub, alpha = 0.5) - p <- p + geom_line(aes(group = method), + p <- p + geom_line(alpha = 0.4, na.rm = TRUE, linetype = "dotted") + p <- p + geom_point(data = sub, alpha = 0.4) + p <- p + geom_line(aes(group = interaction(method, init)), stat = "summary", fun = "mean", na.rm = TRUE) } else if (type == "median") { - p <- p + geom_line(alpha = 0.5, na.rm = TRUE, linetype = "dotted") - p <- p + geom_point(data = sub, alpha = 0.5) - p <- p + geom_line(aes(group = method), + p <- p + geom_line(alpha = 0.4, na.rm = TRUE, linetype = "dotted") + p <- p + geom_point(data = sub, alpha = 0.4) + p <- p + geom_line(aes(group = interaction(method, init)), stat = "summary", fun = "median", na.rm = TRUE) } # return with theme and annotations @@ -182,7 +224,7 @@ max.iter <- 2 Delta.1 <- sqrt(0.5)^abs(outer(seq_len(p), seq_len(p), `-`)) Delta.2 <- sqrt(0.5)^abs(outer(seq_len(q), seq_len(q), `-`)) for (rep in 1:reps) { - cat(sprintf("%4d / %d simulation rep. started\n", rep, reps)) + cat(sprintf("\n\033[1m%4d / %d simulation rep. 
started\033[0m\n", rep, reps)) alpha.1.true <- alpha.1 <- matrix(rnorm(q * r), q, r) alpha.2.true <- alpha.2 <- matrix(rnorm(p * k), p, k) @@ -194,10 +236,10 @@ for (rep in 1:reps) { head(rep(c(0, pi / 2), ceiling(0.5 * k * r)), k * r) )) dim(Fy) <- c(n, k, r) - X <- mlm(Fy, alpha, beta, modes = 3:2) + X <- mlm(Fy, alpha.1, alpha.2, modes = 3:2) X <- X + rtensornorm(n, 0, Delta.1, Delta.2, sample.mode = 1L) - hist.sim <- sim(X, Fy, alpha.true, beta.true, max.iter = max.iter) + hist.sim <- sim(X, Fy, alpha.1.true, alpha.2.true, max.iter = max.iter) hist.sim$repetition <- rep hist <- rbind(hist, hist.sim) @@ -213,17 +255,47 @@ hist$repetition <- factor(hist$repetition) for (response in c("loss", "mse", "dist", "dist.alpha", "dist.beta")) { for (fun in c("all", "mean", "median")) { - print(plot.hist2(hist, response, fun, title = fun) + coord_trans(x = "log1p")) + title <- paste(fun, paste(c("n", "p", "q", "k", "r"), c(n, p, q, k, r), sep = "=", collapse = ", ")) + print(plot.hist2(hist, response, fun, title = title) + + coord_trans(x = "log1p")) dev.print(png, file = sprintf("%s_%s_%s_%s.png", sim.name, datetime, response, fun), width = 768, height = 768, res = 125) if (response != "loss") { - print(plot.hist2(hist, response, fun, title = fun) + coord_trans(x = "log1p", y = "log1p")) + print(plot.hist2(hist, response, fun, title = title) + + coord_trans(x = "log1p", y = "log1p")) dev.print(png, file = sprintf("%s_%s_%s_%s_log.png", sim.name, datetime, response, fun), width = 768, height = 768, res = 125) } } } +stats <- local({ + # final result from history + sub <- na.omit(hist) + sub <- aggregate(sub, list( + method = sub$method, init = sub$init, repetition = sub$repetition + ), tail, 1) + + # aggregate statistics over repetitions + stats.mean <- aggregate(subset(sub, select = c("loss", "mse", "dist.alpha", "dist.beta")), + list(method = sub$method, init = sub$init), mean) + stats.sd <- aggregate(subset(sub, select = c("loss", "mse", "dist.alpha", "dist.beta")), + list(method = sub$method, init = sub$init), sd) + + # merge mean and sd stats together + merge(stats.mean, stats.sd, by = c("method", "init"), suffixes = c(".mean", ".sd")) +}) +print(stats, digits = 2) +# method init loss.mean mse.mean dist.alpha.mean dist.beta.mean loss.sd mse.sd dist.alpha.sd dist.beta.sd +# 1 approx ls 5457 0.99 0.033 0.030 163 0.025 0.017 0.012 +# 2 approx vlp 6819 3.99 0.267 0.287 1995 12.256 0.448 0.457 +# 3 base vlp -2642 1.82 0.248 0.271 1594 2.714 0.447 0.458 +# 4 momentum ls -3479 0.99 0.037 0.035 95 0.025 0.017 0.015 +# 5 momentum vlp -2704 1.78 0.233 0.260 1452 2.658 0.438 0.448 +# 6 new ls -3479 0.99 0.037 0.035 95 0.025 0.017 0.015 +# 7 new vlp -2704 1.78 0.233 0.260 1452 2.658 0.438 0.448 + + ################################################################################ ### Sim 3 ### ################################################################################ diff --git a/tensorPredictors/NAMESPACE b/tensorPredictors/NAMESPACE index 739b05c..05d7405 100644 --- a/tensorPredictors/NAMESPACE +++ b/tensorPredictors/NAMESPACE @@ -15,7 +15,6 @@ export(dist.projection) export(dist.subspace) export(kpir.approx) export(kpir.base) -export(kpir.kron) export(kpir.ls) export(kpir.momentum) export(kpir.new) diff --git a/tensorPredictors/R/CISE.R b/tensorPredictors/R/CISE.R index 5795615..cf19e7c 100644 --- a/tensorPredictors/R/CISE.R +++ b/tensorPredictors/R/CISE.R @@ -25,7 +25,7 @@ #' dataset <- function(name, n = 60, p = 24) { #' name <- toupper(name) #' if (!startsWith('M', name)) { name <- paste0('M', 
name) } -#' +#' #' if (name %in% c('M1', 'M2', 'M3')) { #' Sigma <- 0.5^abs(outer(1:p, 1:p, `-`)) #' X <- rmvnorm(n, sigma = Sigma) @@ -52,7 +52,7 @@ #' } else { #' stop('Unknown dataset name.') #' } -#' +#' #' list(X = X, y = y, B = B) #' } #' # Sample dataset @@ -63,16 +63,16 @@ #' P.Fy <- Fy %*% solve(crossprod(Fy), t(Fy)) #' M <- with(ds, crossprod(X, P.Fy %*% X) / nrow(X)) # Sigma Fit #' N <- cov(ds$X) # Sigma -#' +#' #' fits <- CISE(M, N, d = ncol(ds$B), Theta = log(seq(1, exp(1e-3), len = 1000))) -#' +#' #' BIC <- unlist(Map(attr, fits, 'BIC')) #' df <- unlist(Map(attr, fits, 'df')) #' dist <- unlist(Map(attr, fits, 'dist')) #' iter <- unlist(Map(attr, fits, 'iter')) #' theta <- unlist(Map(attr, fits, 'theta')) #' p.theta <- unlist(Map(function(V) sum(rowSums(V^2) > 1e-9), fits)) -#' +#' #' par(mfrow = c(2, 2)) #' plot(theta, BIC, type = 'l') #' plot(theta, p.theta, type = 'l') @@ -84,7 +84,9 @@ #' Reduction and Variable Selection" By Xin Chen, Changliang Zou and #' R. Dennis Cook. #' -#' @suggest RSpectra +#' @note for speed reasons this functions attempts to use +#' \code{\link[RSpectra]{eigs_sym}} if \pkg{\link{RSpectra}} is installed, +#' otherwise \code{\link{eigen}} is used which might be significantly slower. #' #' @export CISE <- function(M, N, d = 1L, method = "PFC", max.iter = 100L, Theta = NULL, diff --git a/tensorPredictors/R/kpir_approx.R b/tensorPredictors/R/kpir_approx.R index 5520a9a..55b7b46 100644 --- a/tensorPredictors/R/kpir_approx.R +++ b/tensorPredictors/R/kpir_approx.R @@ -9,6 +9,7 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), max.iter = 500L, max.line.iter = 50L, step.size = 1e-3, nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)), + max.init.iter = 20L, init.method = c("ls", "vlp"), eps = .Machine$double.eps, logger = NULL ) { @@ -17,7 +18,7 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), stopifnot(nrow(X) == NROW(Fy)) n <- nrow(X) # Number of observations - # Get and check predictor dimensions + # Check predictor dimensions if (length(dim(X)) == 2L) { stopifnot(!missing(shape)) stopifnot(ncol(X) == prod(shape[1:2])) @@ -26,18 +27,16 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), } else if (length(dim(X)) == 3L) { p <- dim(X)[2] q <- dim(X)[3] - dim(X) <- c(n, p * q) } else { stop("'X' must be a matrix or 3-tensor") } - # Get and check response dimensions + # Check response dimensions if (!is.array(Fy)) { Fy <- as.array(Fy) } if (length(dim(Fy)) == 1L) { k <- r <- 1L - dim(Fy) <- c(n, 1L) } else if (length(dim(Fy)) == 2L) { stopifnot(!missing(shape)) stopifnot(ncol(Fy) == prod(shape[3:4])) @@ -46,34 +45,44 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), } else if (length(dim(Fy)) == 3L) { k <- dim(Fy)[2] r <- dim(Fy)[3] - dim(Fy) <- c(n, k * r) } else { stop("'Fy' must be a vector, matrix or 3-tensor") } - ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon` - # Vectorize - dim(Fy) <- c(n, k * r) - dim(X) <- c(n, p * q) - # Solve - cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace - if (n <= k * r || qr(cpFy)$rank < k * r) { - # In case of under-determined system replace the inverse in the normal - # equation by the Moore-Penrose Pseudo Inverse - B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X)) - } else { - # Compute OLS estimate by the Normal Equation - B <- t(solve(cpFy, crossprod(Fy, X))) + ### Step 1: (Approx) Least Squares initial estimate + init.method <- match.arg(init.method) + if (init.method == "ls") { + # De-Vectroize (from 
now on tensor arithmetics) + dim(Fy) <- c(n, k, r) + dim(X) <- c(n, p, q) + + ls <- kpir.ls(X, Fy, max.iter = max.init.iter, sample.mode = 1L, eps = eps) + c(beta0, alpha0) %<-% ls$alphas + } else { # Van Loan and Pitsianis + # Vectorize + dim(Fy) <- c(n, k * r) + dim(X) <- c(n, p * q) + + # solution for `X = Fy B' + epsilon` + cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace + if (n <= k * r || qr(cpFy)$rank < k * r) { + # In case of under-determined system replace the inverse in the normal + # equation by the Moore-Penrose Pseudo Inverse + B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X)) + } else { + # Compute OLS estimate by the Normal Equation + B <- t(solve(cpFy, crossprod(Fy, X))) + } + + # De-Vectroize (from now on tensor arithmetics) + dim(Fy) <- c(n, k, r) + dim(X) <- c(n, p, q) + + # Decompose `B = alpha x beta` into `alpha` and `beta` + c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k)) } - # De-Vectroize (from now on tensor arithmetics) - dim(Fy) <- c(n, k, r) - dim(X) <- c(n, p, q) - - # Decompose `B = alpha x beta` into `alpha` and `beta` - c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k)) - # Compute residuals R <- X - (Fy %x_3% alpha0 %x_2% beta0) diff --git a/tensorPredictors/R/kpir_momentum.R b/tensorPredictors/R/kpir_momentum.R index 9471171..3e26449 100644 --- a/tensorPredictors/R/kpir_momentum.R +++ b/tensorPredictors/R/kpir_momentum.R @@ -5,6 +5,7 @@ kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), max.iter = 500L, max.line.iter = 50L, step.size = 1e-3, nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)), + max.init.iter = 20L, init.method = c("ls", "vlp"), eps = .Machine$double.eps, logger = NULL ) { @@ -48,19 +49,30 @@ kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), } - ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon` - cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace - if (n <= k * r || qr(cpFy)$rank < k * r) { - # In case of under-determined system replace the inverse in the normal - # equation by the Moore-Penrose Pseudo Inverse - B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X)) - } else { - # Compute OLS estimate by the Normal Equation - B <- t(solve(cpFy, crossprod(Fy, X))) - } + ### Step 1: (Approx) Least Squares initial estimate + init.method <- match.arg(init.method) + if (init.method == "ls") { + dim(X) <- c(n, p, q) + dim(Fy) <- c(n, k, r) + ls <- kpir.ls(X, Fy, max.iter = max.init.iter, sample.mode = 1L, eps = eps) + c(beta0, alpha0) %<-% ls$alphas + dim(X) <- c(n, p * q) + dim(Fy) <- c(n, k * r) + } else { # Van Loan and Pitsianis + # solution for `X = Fy B' + epsilon` + cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace + if (n <= k * r || qr(cpFy)$rank < k * r) { + # In case of under-determined system replace the inverse in the normal + # equation by the Moore-Penrose Pseudo Inverse + B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X)) + } else { + # Compute OLS estimate by the Normal Equation + B <- t(solve(cpFy, crossprod(Fy, X))) + } - # Decompose `B = alpha x beta` into `alpha` and `beta` - c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k)) + # Decompose `B = alpha x beta` into `alpha` and `beta` + c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k)) + } # Compute residuals resid <- X - tcrossprod(Fy, kronecker(alpha0, beta0)) diff --git a/tensorPredictors/R/kpir_new.R b/tensorPredictors/R/kpir_new.R index e1bdfff..0815e18 100644 --- a/tensorPredictors/R/kpir_new.R +++ b/tensorPredictors/R/kpir_new.R @@ -3,6 +3,7 @@ #' @export 
kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), max.iter = 500L, max.line.iter = 50L, step.size = 1e-3, + max.init.iter = 20L, init.method = c("ls", "vlp"), eps = .Machine$double.eps, logger = NULL ) { @@ -45,19 +46,30 @@ kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), stop("'Fy' must be a vector, matrix or 3-tensor") } - ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon` - cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace - if (n <= k * r || qr(cpFy)$rank < k * r) { - # In case of under-determined system replace the inverse in the normal - # equation by the Moore-Penrose Pseudo Inverse - B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X)) - } else { - # Compute OLS estimate by the Normal Equation - B <- t(solve(cpFy, crossprod(Fy, X))) - } + ### Step 1: (Approx) Least Squares initial estimate + init.method <- match.arg(init.method) + if (init.method == "ls") { + dim(X) <- c(n, p, q) + dim(Fy) <- c(n, k, r) + ls <- kpir.ls(X, Fy, max.iter = max.init.iter, sample.mode = 1L, eps = eps) + c(beta, alpha) %<-% ls$alphas + dim(X) <- c(n, p * q) + dim(Fy) <- c(n, k * r) + } else { # Van Loan and Pitsianis + # solution for `X = Fy B' + epsilon` + cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace + if (n <= k * r || qr(cpFy)$rank < k * r) { + # In case of under-determined system replace the inverse in the normal + # equation by the Moore-Penrose Pseudo Inverse + B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X)) + } else { + # Compute OLS estimate by the Normal Equation + B <- t(solve(cpFy, crossprod(Fy, X))) + } - # Decompose `B = alpha x beta` into `alpha` and `beta` - c(alpha, beta) %<-% approx.kronecker(B, c(q, r), c(p, k)) + # Decompose `B = alpha x beta` into `alpha` and `beta` + c(alpha, beta) %<-% approx.kronecker(B, c(q, r), c(p, k)) + } # Compute residuals resid <- X - tcrossprod(Fy, kronecker(alpha, beta))
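As a closing illustration of the interface changes above, a hypothetical end-to-end call sequence could look as follows. Data generation mirrors simulations/kpir_sim.R; the sine features used to fill `Fy` are a simplification, and the handling of the fitted `kpir.ls` object assumes the `$alphas` list layout destructured in kpir_approx.R, since the full return structures are not shown in this diff.

library(tensorPredictors)

n <- 200; p <- 11; q <- 7; k <- 3; r <- 5
alpha <- matrix(rnorm(q * r), q, r)   # "true" q x r parameter
beta  <- matrix(rnorm(p * k), p, k)   # "true" p x k parameter
y  <- rnorm(n)
Fy <- sapply(seq_len(k * r), function(j) sin(j * y))  # illustrative features only
dim(Fy) <- c(n, k, r)

Delta.1 <- sqrt(0.5)^abs(outer(seq_len(p), seq_len(p), `-`))  # AR(sqrt(0.5))
Delta.2 <- sqrt(0.5)^abs(outer(seq_len(q), seq_len(q), `-`))
X <- mlm(Fy, alpha, beta, modes = 3:2)
X <- X + rtensornorm(n, 0, Delta.1, Delta.2, sample.mode = 1L)

# gradient descent estimator with the two initialization strategies added here
fit.ls  <- kpir.new(X, Fy, max.iter = 500L, init.method = "ls")
fit.vlp <- kpir.new(X, Fy, max.iter = 500L, init.method = "vlp")

# alternating least squares estimator used for the "ls" initialization
fit.als <- kpir.ls(X, Fy, sample.mode = 1L, max.iter = 20L)
beta.hat  <- fit.als$alphas[[1]]   # p x k, order as destructured in kpir_approx.R
alpha.hat <- fit.als$alphas[[2]]   # q x r
dist.subspace(c(alpha), c(alpha.hat))   # subspace distance to the truth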