fix: added second argument to mcrossprod,
add: kpir.mle, wip: LaTeX
This commit is contained in:
parent
6455d27386
commit
1838da8b3d
|
@ -293,13 +293,13 @@ with $\pi$ being a permutation of $p q k r$ elements corresponding to permuting
|
||||||
\section{Kronecker Covariance Structure}
|
\section{Kronecker Covariance Structure}
|
||||||
As before we let the sample model for tensor valued opbservations and responses or oder $r$ be
|
As before we let the sample model for tensor valued opbservations and responses or oder $r$ be
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\ten{X} = \ten{\mu} + \ten{F}\times_{j\in[r]}\alpha_j + \ten{\epsilon}
|
\ten{X} = \ten{\mu} + \ten{F}\times_{j\in[r]}\mat{\alpha}_j + \ten{\epsilon}
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
but the error tensor $\ten{\epsilon}\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ is Tensor Normal distributed with mean zero and covariance matrices $\mat{\Delta}_1, ..., \mat{\Delta}_r$.
|
but the error tensor $\ten{\epsilon}\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ is Tensor Normal distributed with mean zero and covariance matrices $\mat{\Delta}_1, ..., \mat{\Delta}_r$.
|
||||||
|
|
||||||
The sample model for $n$ observations has the same form with an additional sample axis in the last mode of $\ten{X}$ and $\ten{Y}$ with dimensions $p_1\times ...\times p_r\times n$ and $q_1\times ...\times q_r\times n$, respectively. Let the residual tensor be
|
The sample model for $n$ observations has the same form with an additional sample axis in the last mode of $\ten{X}$ and $\ten{Y}$ with dimensions $p_1\times ...\times p_r\times n$ and $q_1\times ...\times q_r\times n$, respectively. Let the residual tensor be
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\ten{R} = \ten{X} - \ten{\mu} + \ten{F}\times_{j\in[r]}\alpha_j.
|
\ten{R} = \ten{X} - \ten{\mu} - \ten{F}\times_{j\in[r]}\mat{\alpha}_j.
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
|
|
||||||
By the definition of the Tensor Normal, using the notation $p_{\lnot j} = \prod_{k\neq j}p_j$, we get for observations $\ten{X}, \ten{F}$ the log-likelihood in terms of the residuals as
|
By the definition of the Tensor Normal, using the notation $p_{\lnot j} = \prod_{k\neq j}p_j$, we get for observations $\ten{X}, \ten{F}$ the log-likelihood in terms of the residuals as
|
||||||
|
@ -315,7 +315,7 @@ For deriving the MLE estimates we compute the differential of the log-likelihood
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\d l =
|
\d l =
|
||||||
-\sum_{j = 1}^r \frac{n p_{\lnot j}}{2}\d\log|{\mat{\Delta}}_j|
|
-\sum_{j = 1}^r \frac{n p_{\lnot j}}{2}\d\log|{\mat{\Delta}}_j|
|
||||||
-\frac{1}{2}\sum_{j = 1}^r\langle {\ten{R}}\times_{k\in[r]\backslash j}{\mat{\Delta}}_k^{-1}\times_j\d{\mat{\Delta}}^{-1}_j \rangle
|
-\frac{1}{2}\sum_{j = 1}^r\langle {\ten{R}}\times_{k\in[r]\backslash j}{\mat{\Delta}}_k^{-1}\times_j\d{\mat{\Delta}}^{-1}_j, \ten{R} \rangle
|
||||||
-\langle {\ten{R}}\times_{j\in[r]}{\mat{\Delta}}_j^{-1}, \d{\ten{R}} \rangle.
|
-\langle {\ten{R}}\times_{j\in[r]}{\mat{\Delta}}_j^{-1}, \d{\ten{R}} \rangle.
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
Using $\d\log|\mat{A}| = \tr(\mat{A}^{-1}\d\mat{A})$ and $\d\mat{A}^{-1} = -\mat{A}^{-1}(\d\mat{A})\mat{A}^{-1}$ as well as $\langle\ten{A}, \ten{B}\rangle = \tr(\ten{A}_{(j)}\t{\ten{B}_{(j)}})$ for any $j = 1, ..., r$ we get the differential of the estimated log-likelihood as
|
Using $\d\log|\mat{A}| = \tr(\mat{A}^{-1}\d\mat{A})$ and $\d\mat{A}^{-1} = -\mat{A}^{-1}(\d\mat{A})\mat{A}^{-1}$ as well as $\langle\ten{A}, \ten{B}\rangle = \tr(\ten{A}_{(j)}\t{\ten{B}_{(j)}})$ for any $j = 1, ..., r$ we get the differential of the estimated log-likelihood as
|
||||||
|
@ -349,14 +349,43 @@ as well as gradients (even though they are not realy used, except in the case of
|
||||||
We continue by substitution of the covariance estimates and get
|
We continue by substitution of the covariance estimates and get
|
||||||
\begin{align*}
|
\begin{align*}
|
||||||
\d\hat{l} &= -\langle \ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \d\ten{R} \rangle \\
|
\d\hat{l} &= -\langle \ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \d\ten{R} \rangle \\
|
||||||
&= \sum_{j = 1}^r \langle \ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k\times_j\d\widehat{\mat{\alpha}}_j \rangle \\
|
&= \sum_{j = 1}^r \langle \ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1}, \ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k\times_j\d\widehat{\mat{\alpha}}_j \rangle \\
|
||||||
&= \sum_{j = 1}^r \tr\big( (\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1})_{(j)}}\d\widehat{\mat{\alpha}}_j \big).
|
&= \sum_{j = 1}^r \tr\big( (\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}}\d\widehat{\mat{\alpha}}_j \big).
|
||||||
\end{align*}
|
\end{align*}
|
||||||
Through that the gradient for all the parameter matrices is
|
Through that the gradient for all the parameter matrices is
|
||||||
\begin{align*}
|
\begin{align*}
|
||||||
\nabla_{\widehat{\mat{\alpha}}_j}\hat{l} &= (\ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}
|
\nabla_{\widehat{\mat{\alpha}}_j}\hat{l} &= (\ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}
|
||||||
\qquad{\color{gray}p_j\times q_j} && j = 1, ..., r.
|
\qquad{\color{gray}p_j\times q_j} && j = 1, ..., r.
|
||||||
\end{align*}
|
\end{align*}
|
||||||
|
By equating the gradients to zero and expanding $\ten{R}$ we get a system of normal equations
|
||||||
|
\begin{gather*}
|
||||||
|
0\overset{!}{=} \nabla_{\widehat{\mat{\alpha}}_j}\hat{l}
|
||||||
|
= ((\ten{X}-\widehat{\ten{\mu}}-\ten{F}\times_{l\in[r]}\widehat{\mat{\alpha}}_l)\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}} \\
|
||||||
|
%
|
||||||
|
((\ten{X}-\widehat{\ten{\mu}})\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}} = (\ten{F}\times_{k\in[r]}\widehat{\mat{\Delta}}_k^{-1}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}
|
||||||
|
\end{gather*}
|
||||||
|
with cross dependent solutions for the MLE estimates $\widehat{\mat{\alpha}}$ given by
|
||||||
|
\begin{displaymath}
|
||||||
|
\widehat{\mat{\alpha}}_j = \widehat{\mat{\Delta}}_j((\ten{X}-\widehat{\ten{\mu}})\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}[(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\Delta}}_k^{-1}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}]^{-1}.
|
||||||
|
\end{displaymath}
|
||||||
|
for $j = 1, ..., r$.
|
||||||
|
|
||||||
|
\begin{example}[Simple multivariate case ($r = 1$)]
|
||||||
|
Lets consider the case of $r = 1$, this is the usual multivariate regression case for vector valued predictors. Here $\mat{F}$ denotes the $n\times q$ centered model matrix and the responses are collected in the $n\times p$ matrix $\mat{X}$ with the $n$ observations in the rows and the variables by columns. Furthermore, let the residual estimate matrix $\mat{R} = \mat{X} - \mat{F}\t{\widehat{\mat{\alpha}}_1}$. In this case the tensor MLE estimates simplify to there well known form
|
||||||
|
\begin{align*}
|
||||||
|
\widehat{\mat{\alpha}}_1
|
||||||
|
&= \widehat{\mat{\Delta}}_1((\ten{X}-\widehat{\ten{\mu}})\times_{k\in[1]}{\widehat{\mat{\Delta}}}_k^{-1})_{(1)}\t{(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(j)}}[(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\Delta}}_k^{-1}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(j)}}]^{-1} \\
|
||||||
|
&= \widehat{\mat{\Delta}}_1\widehat{\mat{\Delta}}_1^{-1}((\ten{X}-\widehat{\ten{\mu}}))_{(1)}\t{\ten{F}_{(1)}}[\ten{F}_{(1)}\t{\ten{F}_{(1)}}]^{-1} \\
|
||||||
|
&= \t{\mat{X}}\mat{F}(\t{\mat{F}}\mat{F})^{-1}
|
||||||
|
\end{align*}
|
||||||
|
as well as the covariance estimate
|
||||||
|
\begin{align*}
|
||||||
|
\widehat{\mat{\Delta}}_1
|
||||||
|
&= \frac{1}{n}(\ten{R}\times_{k\in\emptyset}{\widehat{\mat{\Delta}}}_k^{-1})_{(1)}\t{\ten{R}_{(1)}} = \frac{1}{n}\t{\mat{R}}\mat{R}
|
||||||
|
= \frac{1}{n}\t{(\mat{X} - \mat{F}\t{\widehat{\mat{\alpha}}_1})}(\mat{X} - \mat{F}\t{\widehat{\mat{\alpha}}_1}).
|
||||||
|
\end{align*}
|
||||||
|
Note that $\ten{F}_{(1)} = \t{\mat{F}}$, $\ten{X}_{(1)} = \t{\mat{X}}$ and $\ten{R}_{(1)} = \t{\mat{R}}$.
|
||||||
|
\end{example}
|
||||||
|
|
||||||
\paragraph{Comparison to the general case:} {\color{red} There are two main differences, first the general case has a closed form solution for the gradient due to the explicit nature of the MLE estimate of $\widehat{\mat\Delta}$ compared to the mutually dependent MLE estimates $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$. On the other hand the general case has dramatically bigger dimensions of the covariance matrix ($p q \times p q$) compared to the two Kronecker components with dimensions $q \times q$ and $p \times p$. This means that in the general case there is a huge performance penalty in the dimensions of $\widehat{\mat\Delta}$ while in the other case an extra estimation is required to determine $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$.}
|
\paragraph{Comparison to the general case:} {\color{red} There are two main differences, first the general case has a closed form solution for the gradient due to the explicit nature of the MLE estimate of $\widehat{\mat\Delta}$ compared to the mutually dependent MLE estimates $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$. On the other hand the general case has dramatically bigger dimensions of the covariance matrix ($p q \times p q$) compared to the two Kronecker components with dimensions $q \times q$ and $p \times p$. This means that in the general case there is a huge performance penalty in the dimensions of $\widehat{\mat\Delta}$ while in the other case an extra estimation is required to determine $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$.}
|
||||||
|
|
||||||
|
@ -508,11 +537,11 @@ and therefore the gradients
|
||||||
|
|
||||||
\section{Thoughts on initial value estimation}
|
\section{Thoughts on initial value estimation}
|
||||||
\todo{This section uses an alternative notation as it already tries to generalize to general multi-dimensional arrays. Furthermore, one of the main differences is that the observation are indexed in the \emph{last} mode. The benefit of this is that the mode product and parameter matrix indices match not only in the population model but also in sample versions.}
|
\todo{This section uses an alternative notation as it already tries to generalize to general multi-dimensional arrays. Furthermore, one of the main differences is that the observation are indexed in the \emph{last} mode. The benefit of this is that the mode product and parameter matrix indices match not only in the population model but also in sample versions.}
|
||||||
Let $\ten{X}, \ten{F}$ be order $r$ tensors of dimensions $p_1\times ... \times p_r$ and $q_1\times ... \times q_r$, respectively. Also denote the error tensor $\epsilon$ of the same order and dimensions as $\ten{X}$. The considered model for the $i$'th observation is
|
Let $\ten{X}_i, \ten{F}_i$ be order $r$ tensors of dimensions $p_1\times ... \times p_r$ and $q_1\times ... \times q_r$, respectively. Also denote the error tensor $\epsilon$ of the same order and dimensions as $\ten{X}_i$. The considered model for the $i$'th observation is
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\ten{X}_i = \ten{\mu} + \ten{F}_i\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon}_i
|
\ten{X}_i = \ten{\mu} + \ten{F}_i\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon}_i
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distributed $\ten{\epsilon}\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for $\mat{\Delta}_j\in\mathcal{S}^{p_j}_{++}$, $j = 1, ..., r$. Given $i = 1, ..., n$ observations the collected model containing all observations
|
where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distributed $\ten{\epsilon}_i\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for $\mat{\Delta}_j\in\mathcal{S}^{p_j}_{++}$, $j = 1, ..., r$. Given $i = 1, ..., n$ observations the collected model containing all observations
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\ten{X} = \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon}
|
\ten{X} = \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon}
|
||||||
\end{displaymath}
|
\end{displaymath}
|
||||||
|
|
|
@ -18,6 +18,7 @@ export(hopca)
|
||||||
export(kpir.approx)
|
export(kpir.approx)
|
||||||
export(kpir.base)
|
export(kpir.base)
|
||||||
export(kpir.ls)
|
export(kpir.ls)
|
||||||
|
export(kpir.mle)
|
||||||
export(kpir.momentum)
|
export(kpir.momentum)
|
||||||
export(kpir.new)
|
export(kpir.new)
|
||||||
export(mat)
|
export(mat)
|
||||||
|
|
|
@ -2,11 +2,11 @@
|
||||||
#'
|
#'
|
||||||
#' This is a special case of the "Khatri-Rao Product".
|
#' This is a special case of the "Khatri-Rao Product".
|
||||||
#'
|
#'
|
||||||
#' @seealso \link{\code{rowKronecker}}
|
#' @seealso \code{\link{rowKronecker}}
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
colKronecker <- function(A, B) {
|
colKronecker <- function(A, B) {
|
||||||
sapply(1:ncol(A), function(i) kronecker(A[, i], B[, i]))
|
sapply(seq_len(ncol(A)), function(i) kronecker(A[, i], B[, i]))
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Row wise kronecker product
|
#' Row wise kronecker product
|
||||||
|
@ -14,7 +14,7 @@ colKronecker <- function(A, B) {
|
||||||
#' Also known as the "Face-splitting product". This is a special case of the
|
#' Also known as the "Face-splitting product". This is a special case of the
|
||||||
#' "Khatri-Rao Product".
|
#' "Khatri-Rao Product".
|
||||||
#'
|
#'
|
||||||
#' @seealso \link{\code{colKronecker}}
|
#' @seealso \code{\link{colKronecker}}
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
rowKronecker <- function(A, B) {
|
rowKronecker <- function(A, B) {
|
||||||
|
|
|
@ -161,7 +161,7 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
||||||
inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
|
inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
|
||||||
|
|
||||||
# backtracking loop
|
# backtracking loop
|
||||||
for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) {
|
for (delta in step.size * 0.618034^seq.int(0L, length = max.line.iter)) {
|
||||||
# Update `alpha` and `beta` (note: add(+), the gradients are already
|
# Update `alpha` and `beta` (note: add(+), the gradients are already
|
||||||
# pointing into the negative slope direction of the loss cause they are
|
# pointing into the negative slope direction of the loss cause they are
|
||||||
# the gradients of the log-likelihood [NOT the negative log-likelihood])
|
# the gradients of the log-likelihood [NOT the negative log-likelihood])
|
||||||
|
|
|
@ -14,8 +14,6 @@ kpir.ls <- function(X, Fy, max.iter = 20L, sample.axis = 1L,
|
||||||
dims[sample.axis] <- length(Fy)
|
dims[sample.axis] <- length(Fy)
|
||||||
dims
|
dims
|
||||||
})
|
})
|
||||||
} else {
|
|
||||||
stopifnot(dim(X)[sample.axis] == dim(Fy)[sample.axis])
|
|
||||||
}
|
}
|
||||||
# Check dimensions
|
# Check dimensions
|
||||||
stopifnot(length(dim(X)) == length(dim(Fy)))
|
stopifnot(length(dim(X)) == length(dim(Fy)))
|
||||||
|
@ -60,8 +58,6 @@ kpir.ls <- function(X, Fy, max.iter = 20L, sample.axis = 1L,
|
||||||
Deltas <- Map(mcrossprod, list(R), mode = modes)
|
Deltas <- Map(mcrossprod, list(R), mode = modes)
|
||||||
Deltas <- Map(`*`, 1 / dim(X)[sample.axis], Deltas)
|
Deltas <- Map(`*`, 1 / dim(X)[sample.axis], Deltas)
|
||||||
|
|
||||||
list(
|
list(alphas = structure(alphas, names = as.character(modes)),
|
||||||
alphas = structure(alphas, names = as.character(modes)),
|
Deltas = structure(Deltas, names = as.character(modes)))
|
||||||
Deltas = structure(Deltas, names = as.character(modes))
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,90 @@
|
||||||
|
#' Per mode (axis) MLE
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
kpir.mle <- function(X, Fy, max.iter = 500L, sample.axis = 1L,
|
||||||
|
logger = NULL #, eps = .Machine$double.eps
|
||||||
|
) {
|
||||||
|
### Step 0: Setup/Initialization
|
||||||
|
if (!is.array(Fy)) {
|
||||||
|
# scalar response case (add new axis of size 1)
|
||||||
|
dim(Fy) <- ifelse(seq_along(dim(X)) == sample.axis, dim(X)[sample.axis], 1L)
|
||||||
|
}
|
||||||
|
# Check dimensions and matching of axis (tensor order)
|
||||||
|
stopifnot(exprs = {
|
||||||
|
length(dim(X)) == length(dim(Fy))
|
||||||
|
dim(X)[sample.axis] == dim(Fy)[sample.axis]
|
||||||
|
})
|
||||||
|
# warn about model constraints
|
||||||
|
if (any(dim(Fy)[-sample.axis] >= dim(X)[-sample.axis])) {
|
||||||
|
warning("Degenerate case 'any(dim(Fy)[-sample.axis] >= dim(X)[-sample.axis])'")
|
||||||
|
}
|
||||||
|
|
||||||
|
# extract dimensions (for easier handling as local variables)
|
||||||
|
modes <- seq_along(dim(X))[-sample.axis] # predictor axis indices
|
||||||
|
n <- dim(X)[sample.axis] # observation count (scalar)
|
||||||
|
p <- dim(X)[-sample.axis] # predictor dimensions (vector)
|
||||||
|
# q <- dim(Fy)[-sample.axis] # response dimensions (vector)
|
||||||
|
# r <- length(dim(X)) - 1L # tensor order (scalar)
|
||||||
|
|
||||||
|
# Means for X and Fy (a.k.a. sum elements over the sample axis)
|
||||||
|
meanX <- apply(X, modes, mean, simplify = TRUE)
|
||||||
|
meanFy <- apply(Fy, modes, mean, simplify = TRUE)
|
||||||
|
# Center both X and Fy
|
||||||
|
X <- sweep(X, modes, meanX)
|
||||||
|
Fy <- sweep(Fy, modes, meanFy)
|
||||||
|
|
||||||
|
|
||||||
|
### Step 1: Initial value estimation
|
||||||
|
alphas <- Map(function(mode, ncol) {
|
||||||
|
La.svd(mcrossprod(X, mode = mode), ncol)$u
|
||||||
|
}, modes, dim(Fy)[modes])
|
||||||
|
# Residuals
|
||||||
|
R <- X - mlm(Fy, alphas, modes = modes)
|
||||||
|
# Covariance Moment estimates
|
||||||
|
Deltas <- Map(mcrossprod, list(R), mode = modes)
|
||||||
|
Deltas <- Map(function(Delta, j) (n * prod(p[-j]))^(-1) * Delta,
|
||||||
|
Deltas, seq_along(Deltas))
|
||||||
|
|
||||||
|
# Call history callback (logger) before the first iteration
|
||||||
|
if (is.function(logger)) { do.call(logger, c(0L, NA, alphas, Deltas)) }
|
||||||
|
|
||||||
|
|
||||||
|
### Step 2: Alternating estimate updates
|
||||||
|
for (iter in seq_len(max.iter)) {
|
||||||
|
# Compute covariance inverses
|
||||||
|
Deltas.inv <- Map(solve, Deltas)
|
||||||
|
|
||||||
|
# "standardize" X
|
||||||
|
Z <- mlm(X, Deltas.inv, modes = modes)
|
||||||
|
|
||||||
|
# Compute new alpha estimates
|
||||||
|
alphas <- Map(function(j) {
|
||||||
|
# MLE estimate for alpha_j | alpha_k, Delta_l for all k != j and l
|
||||||
|
FF <- mlm(Fy, alphas[-j], modes = modes[-j])
|
||||||
|
Deltas[[j]] %*% t(solve(
|
||||||
|
t(mcrossprod(mlm(FF, Deltas.inv[-j], modes = modes[-j]), FF, mode = modes[j])),
|
||||||
|
t(mcrossprod(Z, FF, mode = modes[j]))))
|
||||||
|
}, seq_along(modes))
|
||||||
|
|
||||||
|
# update residuals
|
||||||
|
R <- X - mlm(Fy, alphas, modes = modes)
|
||||||
|
|
||||||
|
# next Delta estimates
|
||||||
|
Deltas <- Map(function(j) {
|
||||||
|
# MLE estimate for Delta_j | Delta_k, alpha_l for all k != j and l
|
||||||
|
(n * prod(p[-j]))^(-1) * mcrossprod(
|
||||||
|
mlm(R, Deltas[-j], modes = modes[-j]), R, mode = modes[j])
|
||||||
|
}, seq_along(modes))
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: break condition!!!
|
||||||
|
|
||||||
|
|
||||||
|
# Call history callback
|
||||||
|
if (is.function(logger)) { do.call(logger, c(iter, NA, alphas, Deltas)) }
|
||||||
|
}
|
||||||
|
|
||||||
|
list(alphas = structure(alphas, names = as.character(modes)),
|
||||||
|
Deltas = structure(Deltas, names = as.character(modes)),
|
||||||
|
meanX = meanX, meanFy = meanFy)
|
||||||
|
}
|
|
@ -137,7 +137,7 @@ kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
||||||
inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
|
inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
|
||||||
|
|
||||||
# backtracking loop
|
# backtracking loop
|
||||||
for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) {
|
for (delta in step.size * 0.618034^seq.int(0L, length = max.line.iter)) {
|
||||||
# Update `alpha` and `beta` (note: add(+), the gradients are already
|
# Update `alpha` and `beta` (note: add(+), the gradients are already
|
||||||
# pointing into the negative slope direction of the loss cause they are
|
# pointing into the negative slope direction of the loss cause they are
|
||||||
# the gradients of the log-likelihood [NOT the negative log-likelihood])
|
# the gradients of the log-likelihood [NOT the negative log-likelihood])
|
||||||
|
|
|
@ -112,7 +112,7 @@ kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
||||||
inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
|
inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
|
||||||
|
|
||||||
# Line Search loop
|
# Line Search loop
|
||||||
for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) {
|
for (delta in step.size * 0.618034^seq.int(0L, length = max.line.iter)) {
|
||||||
# Update `alpha` and `beta` (note: add(+), the gradients are already
|
# Update `alpha` and `beta` (note: add(+), the gradients are already
|
||||||
# pointing into the negative slope direction of the loss cause they are
|
# pointing into the negative slope direction of the loss cause they are
|
||||||
# the gradients of the log-likelihood [NOT the negative log-likelihood])
|
# the gradients of the log-likelihood [NOT the negative log-likelihood])
|
||||||
|
|
|
@ -14,8 +14,41 @@
|
||||||
#' @note equivalent to \code{tcrossprod(mat(A, mode))} with around the same
|
#' @note equivalent to \code{tcrossprod(mat(A, mode))} with around the same
|
||||||
#' performance but only allocates the result matrix.
|
#' performance but only allocates the result matrix.
|
||||||
#'
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' dimA <- c(2, 5, 7, 11)
|
||||||
|
#' A <- array(rnorm(prod(dimA)), dim = dimA)
|
||||||
|
#' stopifnot(exprs = {
|
||||||
|
#' all.equal(mcrossprod(A, mode = 1), tcrossprod(mat(A, 1)))
|
||||||
|
#' all.equal(mcrossprod(A, , 2), tcrossprod(mat(A, 2)))
|
||||||
|
#' all.equal(mcrossprod(A, , mode = 3), tcrossprod(mat(A, 3)))
|
||||||
|
#' all.equal(mcrossprod(A, A, 4), tcrossprod(mat(A, 4)))
|
||||||
|
#' })
|
||||||
|
#'
|
||||||
|
#' dimA <- c(2, 5, 7, 11)
|
||||||
|
#' dimB <- c(3, 5, 7, 11) # same as dimA except 1. entry
|
||||||
|
#' A <- array(rnorm(prod(dimA)), dim = dimA)
|
||||||
|
#' B <- array(rnorm(prod(dimB)), dim = dimB)
|
||||||
|
#' stopifnot(all.equal(
|
||||||
|
#' mcrossprod(A, B, 1),
|
||||||
|
#' tcrossprod(mat(A, 1), mat(B, 1))
|
||||||
|
#' ))
|
||||||
|
#'
|
||||||
|
#' dimA <- c(5, 2, 7, 11)
|
||||||
|
#' dimB <- c(5, 3, 7, 11) # same as dimA except 2. entry
|
||||||
|
#' A <- array(rnorm(prod(dimA)), dim = dimA)
|
||||||
|
#' B <- array(rnorm(prod(dimB)), dim = dimB)
|
||||||
|
#' stopifnot(all.equal(
|
||||||
|
#' mcrossprod(A, B, mode = 2L),
|
||||||
|
#' tcrossprod(mat(A, 2), mat(B, 2))
|
||||||
|
#' ))
|
||||||
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
mcrossprod <- function(A, mode = length(dim(A))) {
|
mcrossprod <- function(A, B, mode = length(dim(A))) {
|
||||||
storage.mode(A) <- "double"
|
storage.mode(A) <- "double"
|
||||||
.Call("C_mcrossprod", A, as.integer(mode))
|
if (missing(B)) {
|
||||||
|
.Call("C_mcrossprod_sym", A, as.integer(mode))
|
||||||
|
} else {
|
||||||
|
storage.mode(B) <- "double"
|
||||||
|
.Call("C_mcrossprod", A, B, as.integer(mode))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,7 +23,7 @@
|
||||||
#' stopifnot(all.equal(C1, C4))
|
#' stopifnot(all.equal(C1, C4))
|
||||||
#'
|
#'
|
||||||
#' # selected modes
|
#' # selected modes
|
||||||
#' C1 <- mlm(A, B, modes = 2:3)
|
#' C1 <- mlm(A, B[2:3], modes = 2:3)
|
||||||
#' C2 <- mlm(A, B[[2]], B[[3]], modes = 2:3)
|
#' C2 <- mlm(A, B[[2]], B[[3]], modes = 2:3)
|
||||||
#' C3 <- ttm(ttm(A, B[[2]], 2), B[[3]], 3)
|
#' C3 <- ttm(ttm(A, B[[2]], 2), B[[3]], 3)
|
||||||
#' stopifnot(all.equal(C1, C2))
|
#' stopifnot(all.equal(C1, C2))
|
||||||
|
@ -32,7 +32,7 @@
|
||||||
#' # analog to matrix multiplication
|
#' # analog to matrix multiplication
|
||||||
#' A <- matrix(rnorm( 6), 2, 3)
|
#' A <- matrix(rnorm( 6), 2, 3)
|
||||||
#' B <- matrix(rnorm(12), 3, 4)
|
#' B <- matrix(rnorm(12), 3, 4)
|
||||||
#' C <- matrix(rnorm(20), 4, 5)
|
#' C <- matrix(rnorm(20), 5, 4)
|
||||||
#' stopifnot(all.equal(
|
#' stopifnot(all.equal(
|
||||||
#' A %*% B %*% t(C),
|
#' A %*% B %*% t(C),
|
||||||
#' mlm(B, list(A, C))
|
#' mlm(B, list(A, C))
|
||||||
|
@ -56,6 +56,16 @@
|
||||||
#' stopifnot(all.equal(P1, P2))
|
#' stopifnot(all.equal(P1, P2))
|
||||||
#' stopifnot(all.equal(P1, P3))
|
#' stopifnot(all.equal(P1, P3))
|
||||||
#'
|
#'
|
||||||
|
#' Concatination of MLM is MLM
|
||||||
|
#' dimX <- c(4, 8, 6, 3)
|
||||||
|
#' dimA <- c(3, 17, 19, 2)
|
||||||
|
#' dimB <- c(7, 11, 13, 5)
|
||||||
|
#' X <- array(rnorm(prod(dimX)), dim = dimX)
|
||||||
|
#' As <- Map(function(p, q) matrix(rnorm(p * q), p, q), dimA, dimX)
|
||||||
|
#' Bs <- Map(function(p, q) matrix(rnorm(p * q), p, q), dimB, dimA)
|
||||||
|
#' # (X x {A1, A2, A3, A4}) x {B1, B2, B3, B4} = X x {B1 A1, B2 A2, B3 A3, B4 A4}
|
||||||
|
#' all.equal(mlm(mlm(X, As), Bs), mlm(X, Map(`%*%`, Bs, As)))
|
||||||
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
mlm <- function(A, B, ..., modes = seq_along(B)) {
|
mlm <- function(A, B, ..., modes = seq_along(B)) {
|
||||||
# Collect all matrices in `B`
|
# Collect all matrices in `B`
|
||||||
|
|
|
@ -1,78 +0,0 @@
|
||||||
#' Kronecker decomposed Variance Matrix estimation.
|
|
||||||
#'
|
|
||||||
#' @description Computes the kronecker decomposition factors of the variance
|
|
||||||
#' matrix
|
|
||||||
#' \deqn{\var{X} = tr(L)tr(R) (L\otimes R).}
|
|
||||||
#'
|
|
||||||
#' @param X numeric matrix or 3d array.
|
|
||||||
#' @param shape in case of \code{X} being a matrix, this specifies the predictor
|
|
||||||
#' shape, otherwise ignored.
|
|
||||||
#' @param center boolean specifying if \code{X} is centered before computing the
|
|
||||||
#' left/right second moments. This is usefull in the case of allready centered
|
|
||||||
#' data.
|
|
||||||
#'
|
|
||||||
#' @returns List containing
|
|
||||||
#' \describe{
|
|
||||||
#' \item{lhs}{Left Hand Side \eqn{L} of the kronecker decomposed variance matrix}
|
|
||||||
#' \item{rhs}{Right Hand Side \eqn{R} of the kronecker decomposed variance matrix}
|
|
||||||
#' \item{trace}{Scaling factor \eqn{tr(L)tr(R)} for the variance matrix}
|
|
||||||
#' }
|
|
||||||
#'
|
|
||||||
#' @examples
|
|
||||||
#' n <- 503L # nr. of observations
|
|
||||||
#' p <- 32L # first predictor dimension
|
|
||||||
#' q <- 27L # second predictor dimension
|
|
||||||
#' lhs <- 0.5^abs(outer(seq_len(q), seq_len(q), `-`)) # Left Var components
|
|
||||||
#' rhs <- 0.5^abs(outer(seq_len(p), seq_len(p), `-`)) # Right Var components
|
|
||||||
#' X <- rmvnorm(n, sigma = kronecker(lhs, rhs)) # Multivariate normal data
|
|
||||||
#'
|
|
||||||
#' # Estimate kronecker decomposed variance matrix
|
|
||||||
#' dim(X) # c(n, p * q)
|
|
||||||
#' fit <- var.kronecker(X, shape = c(p, q))
|
|
||||||
#'
|
|
||||||
#' # equivalent
|
|
||||||
#' dim(X) <- c(n, p, q)
|
|
||||||
#' fit <- var.kronecker(X)
|
|
||||||
#'
|
|
||||||
#' # Compute complete estimated variance matrix
|
|
||||||
#' varX.hat <- fit$trace^-1 * kronecker(fit$lhs, fit$rhs)
|
|
||||||
#'
|
|
||||||
#' # or its inverse
|
|
||||||
#' varXinv.hat <- fit$trace * kronecker(solve(fit$lhs), solve(fit$rhs))
|
|
||||||
#'
|
|
||||||
var.kronecker <- function(X, shape = dim(X)[-1], center = TRUE) {
|
|
||||||
# Get and check predictor dimensions
|
|
||||||
n <- nrow(X)
|
|
||||||
if (length(dim(X)) == 2L) {
|
|
||||||
stopifnot(ncol(X) == prod(shape[1:2]))
|
|
||||||
p <- as.integer(shape[1]) # Predictor "height"
|
|
||||||
q <- as.integer(shape[2]) # Predictor "width"
|
|
||||||
dim(X) <- c(n, p, q)
|
|
||||||
} else if (length(dim(X)) == 3L) {
|
|
||||||
p <- dim(X)[2]
|
|
||||||
q <- dim(X)[3]
|
|
||||||
} else {
|
|
||||||
stop("'X' must be a matrix or 3-tensor")
|
|
||||||
}
|
|
||||||
|
|
||||||
if (isTRUE(center)) {
|
|
||||||
# Center X; X[, i, j] <- X[, i, j] - mean(X[, i, j])
|
|
||||||
X <- scale(X, scale = FALSE)
|
|
||||||
|
|
||||||
print(range(attr(X, "scaled:center")))
|
|
||||||
|
|
||||||
dim(X) <- c(n, p, q)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Calc left/right side of kronecker structures covariance
|
|
||||||
# var(X) = var.lhs %x% var.rhs
|
|
||||||
var.lhs <- .rowMeans(apply(X, 1, crossprod), q * q, n)
|
|
||||||
dim(var.lhs) <- c(q, q)
|
|
||||||
var.rhs <- .rowMeans(apply(X, 1, tcrossprod), p * p, n)
|
|
||||||
dim(var.rhs) <- c(p, p)
|
|
||||||
|
|
||||||
# Estimate scalling factor tr(var(X)) = tr(var.lhs) tr(var.rhs)
|
|
||||||
trace <- sum(X^2) / n
|
|
||||||
|
|
||||||
list(lhs = var.lhs, rhs = var.rhs, trace = trace)
|
|
||||||
}
|
|
|
@ -9,14 +9,17 @@
|
||||||
/* Tensor Times Matrix a.k.a. Mode Product */
|
/* Tensor Times Matrix a.k.a. Mode Product */
|
||||||
extern SEXP ttm(SEXP A, SEXP X, SEXP mode);
|
extern SEXP ttm(SEXP A, SEXP X, SEXP mode);
|
||||||
|
|
||||||
/* Tensor Mode Covariance e.g. `(1 / n) * A_(m) A_(m)^T` */
|
/* Tensor Mode Crossproduct `A_(m) B_(m)^T` */
|
||||||
extern SEXP mcrossprod(SEXP A, SEXP mode);
|
extern SEXP mcrossprod(SEXP A, SEXP B, SEXP mode);
|
||||||
|
/* Symmetric Tensor Mode Crossproduct `A_(m) A_(m)^T` */
|
||||||
|
extern SEXP mcrossprod_sym(SEXP A, SEXP mode);
|
||||||
|
|
||||||
/* List of registered routines (e.g. C entry points) */
|
/* List of registered routines (a.k.a. C entry points) */
|
||||||
static const R_CallMethodDef CallEntries[] = {
|
static const R_CallMethodDef CallEntries[] = {
|
||||||
// {"FastPOI_C_sub", (DL_FUNC) &FastPOI_C_sub, 5}, // NOT USED
|
// {"FastPOI_C_sub", (DL_FUNC) &FastPOI_C_sub, 5}, // NOT USED
|
||||||
{"C_ttm", (DL_FUNC) &ttm, 3},
|
{"C_ttm", (DL_FUNC) &ttm, 3},
|
||||||
{"C_mcrossprod", (DL_FUNC) &mcrossprod, 2},
|
{"C_mcrossprod", (DL_FUNC) &mcrossprod, 3},
|
||||||
|
{"C_mcrossprod_sym", (DL_FUNC) &mcrossprod_sym, 2},
|
||||||
{NULL, NULL, 0}
|
{NULL, NULL, 0}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -11,16 +11,100 @@
|
||||||
/**
|
/**
|
||||||
* Tensor Mode Crossproduct
|
* Tensor Mode Crossproduct
|
||||||
*
|
*
|
||||||
|
* C = A_(m) t(B_(m))
|
||||||
|
*
|
||||||
|
* For a matrices `A` and `B`, the first mode `mcrossprod(A, B, 1)` is equivalent
|
||||||
|
* to `A %*% t(B)` (`tcrossprod`). On the other hand for mode two
|
||||||
|
* `mcrossprod(A, B, 2)` the equivalence is `t(A) %*% B` (`crossprod`).
|
||||||
|
*
|
||||||
|
* @param A multi-dimensional array
|
||||||
|
* @param B multi-dimensional array
|
||||||
|
* @param m mode index (1-indexed)
|
||||||
|
*/
|
||||||
|
extern SEXP mcrossprod(SEXP A, SEXP B, SEXP m) {
|
||||||
|
// get zero indexed mode
|
||||||
|
int mode = asInteger(m) - 1;
|
||||||
|
|
||||||
|
// get dimension attributes
|
||||||
|
SEXP dimA = getAttrib(A, R_DimSymbol);
|
||||||
|
SEXP dimB = getAttrib(B, R_DimSymbol);
|
||||||
|
|
||||||
|
// validate mode (0-indexed, must be smaller than the tensor order)
|
||||||
|
if (mode < 0 || length(dimA) <= mode || length(dimB) <= mode) {
|
||||||
|
error("Illegal mode");
|
||||||
|
}
|
||||||
|
|
||||||
|
// the strides
|
||||||
|
// `stride[0] <- prod(dim(A)[seq_len(mode - 1)])`
|
||||||
|
// `stride[1] <- dim(A)[mode]`
|
||||||
|
// `stride[2] <- prod(dim(A)[-seq_len(mode)])`
|
||||||
|
// Note: Middle stride is ignored (to be consistent with sym version)
|
||||||
|
int stride[3] = {1, 0, 1};
|
||||||
|
for (int i = 0; i < length(dimA); ++i) {
|
||||||
|
int size = INTEGER(dimA)[i];
|
||||||
|
stride[0] *= (i < mode) ? size : 1;
|
||||||
|
stride[2] *= (i > mode) ? size : 1;
|
||||||
|
// check margin matching of `A` and `B`
|
||||||
|
if (i != mode && size != INTEGER(dimB)[i]) {
|
||||||
|
error("Dimension missmatch");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Result dimensions
|
||||||
|
int nrowC = INTEGER(dimA)[mode];
|
||||||
|
int ncolC = INTEGER(dimB)[mode];
|
||||||
|
|
||||||
|
// create response matrix C
|
||||||
|
SEXP C = PROTECT(allocMatrix(REALSXP, nrowC, ncolC));
|
||||||
|
|
||||||
|
// raw data access pointers
|
||||||
|
double* a = REAL(A);
|
||||||
|
double* b = REAL(B);
|
||||||
|
double* c = REAL(C);
|
||||||
|
|
||||||
|
// employ BLAS dgemm (Double GEneralized Matrix Matrix) operation
|
||||||
|
// (C = alpha op(A) op(A) + beta C, op is the identity of transposition)
|
||||||
|
const double zero = 0.0;
|
||||||
|
const double one = 1.0;
|
||||||
|
if (mode == 0) {
|
||||||
|
// mode 1: special case C = A_(1) B_(1)^T
|
||||||
|
// C = 1 A B^T + 0 C
|
||||||
|
F77_CALL(dgemm)("N", "T", &nrowC, &ncolC, &stride[2],
|
||||||
|
&one, a, &nrowC, b, &ncolC,
|
||||||
|
&zero, c, &nrowC FCONE FCONE);
|
||||||
|
} else {
|
||||||
|
// Other modes writen as accumulated sum of matrix products
|
||||||
|
// initialize C to zero
|
||||||
|
memset(c, 0, nrowC * ncolC * sizeof(double));
|
||||||
|
|
||||||
|
// Sum over all modes > mode
|
||||||
|
for (int i2 = 0; i2 < stride[2]; ++i2) {
|
||||||
|
// C = 1 A^T B + 1 C
|
||||||
|
F77_CALL(dgemm)("T", "N", &nrowC, &ncolC, &stride[0],
|
||||||
|
&one, &a[i2 * stride[0] * nrowC], &stride[0],
|
||||||
|
&b[i2 * stride[0] * ncolC], &stride[0],
|
||||||
|
&one, c, &nrowC FCONE FCONE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// release C to grabage collector
|
||||||
|
UNPROTECT(1);
|
||||||
|
|
||||||
|
return C;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Symmetric Tensor Mode Crossproduct
|
||||||
|
*
|
||||||
* C = A_(m) t(A_(m))
|
* C = A_(m) t(A_(m))
|
||||||
*
|
*
|
||||||
* For a matrix `A`, the first mode is `mcrossprod(A, 1)` equivalent to
|
* For a matrix `A`, the first mode is `mcrossprod_sym(A, 1)` equivalent to
|
||||||
* `A %*% t(A)` (`tcrossprod`). On the other hand for mode two `mcrossprod(A, 2)`
|
* `A %*% t(A)` (`tcrossprod`). On the other hand for mode two
|
||||||
* the equivalence is `t(A) %*% A` (`crossprod`).
|
* `mcrossprod(A, 2)` the equivalence is `t(A) %*% A` (`crossprod`).
|
||||||
*
|
*
|
||||||
* @param A multi-dimensional array
|
* @param A multi-dimensional array
|
||||||
* @param m mode index (1-indexed)
|
* @param m mode index (1-indexed)
|
||||||
*/
|
*/
|
||||||
extern SEXP mcrossprod(SEXP A, SEXP m) {
|
extern SEXP mcrossprod_sym(SEXP A, SEXP m) {
|
||||||
// get zero indexed mode
|
// get zero indexed mode
|
||||||
int mode = asInteger(m) - 1;
|
int mode = asInteger(m) - 1;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue