fix: added second argument to mcrossprod,

add: kpir.mle,
wip: LaTeX
This commit is contained in:
Daniel Kapla 2022-05-27 20:11:48 +02:00
parent 6455d27386
commit 1838da8b3d
13 changed files with 279 additions and 111 deletions

View File

@ -293,13 +293,13 @@ with $\pi$ being a permutation of $p q k r$ elements corresponding to permuting
\section{Kronecker Covariance Structure}
As before we let the sample model for tensor valued observations and responses of order $r$ be
\begin{displaymath}
\ten{X} = \ten{\mu} + \ten{F}\times_{j\in[r]}\mat{\alpha}_j + \ten{\epsilon}
\end{displaymath}
but the error tensor $\ten{\epsilon}\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ is Tensor Normal distributed with mean zero and covariance matrices $\mat{\Delta}_1, ..., \mat{\Delta}_r$.
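Equivalently, in vectorized form (a sketch, assuming the usual column-major vectorization, which fixes the ordering of the Kronecker factors),
\begin{displaymath}
\operatorname{vec}(\ten{\epsilon}) \sim \mathcal{N}_{p_1 \cdots p_r}(0, \mat{\Delta}_r \otimes \cdots \otimes \mat{\Delta}_1).
\end{displaymath}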
The sample model for $n$ observations has the same form with an additional sample axis in the last mode of $\ten{X}$ and $\ten{Y}$ with dimensions $p_1\times ...\times p_r\times n$ and $q_1\times ...\times q_r\times n$, respectively. Let the residual tensor be
\begin{displaymath}
\ten{R} = \ten{X} - \ten{\mu} - \ten{F}\times_{j\in[r]}\mat{\alpha}_j.
\end{displaymath}
By the definition of the Tensor Normal, using the notation $p_{\lnot j} = \prod_{k\neq j}p_k$, we get for observations $\ten{X}, \ten{F}$ the log-likelihood in terms of the residuals as
@ -315,7 +315,7 @@ For deriving the MLE estimates we compute the differential of the log-likelihood
\begin{displaymath}
\d l =
-\sum_{j = 1}^r \frac{n p_{\lnot j}}{2}\d\log|{\mat{\Delta}}_j|
-\frac{1}{2}\sum_{j = 1}^r\langle {\ten{R}}\times_{k\in[r]\backslash j}{\mat{\Delta}}_k^{-1}\times_j\d{\mat{\Delta}}^{-1}_j, \ten{R} \rangle
-\langle {\ten{R}}\times_{j\in[r]}{\mat{\Delta}}_j^{-1}, \d{\ten{R}} \rangle.
\end{displaymath}
Using $\d\log|\mat{A}| = \tr(\mat{A}^{-1}\d\mat{A})$ and $\d\mat{A}^{-1} = -\mat{A}^{-1}(\d\mat{A})\mat{A}^{-1}$ as well as $\langle\ten{A}, \ten{B}\rangle = \tr(\ten{A}_{(j)}\t{\ten{B}_{(j)}})$ for any $j = 1, ..., r$ we get the differential of the estimated log-likelihood as
@ -349,14 +349,43 @@ as well as gradients (even though they are not really used, except in the case of
We continue by substitution of the covariance estimates and get
\begin{align*}
\d\hat{l} &= -\langle \ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \d\ten{R} \rangle \\
&= \sum_{j = 1}^r \langle \ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1}, \ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k\times_j\d\widehat{\mat{\alpha}}_j \rangle \\
&= \sum_{j = 1}^r \tr\big( (\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}}\d\widehat{\mat{\alpha}}_j \big).
\end{align*}
From this the gradients with respect to all parameter matrices are
\begin{align*}
\nabla_{\widehat{\mat{\alpha}}_j}\hat{l} &= (\ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}
\qquad{\color{gray}p_j\times q_j} && j = 1, ..., r.
\end{align*}
By equating the gradients to zero and expanding $\ten{R}$ we get a system of normal equations
\begin{gather*}
0\overset{!}{=} \nabla_{\widehat{\mat{\alpha}}_j}\hat{l}
= ((\ten{X}-\widehat{\ten{\mu}}-\ten{F}\times_{l\in[r]}\widehat{\mat{\alpha}}_l)\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}} \\
%
((\ten{X}-\widehat{\ten{\mu}})\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}} = (\ten{F}\times_{k\in[r]}\widehat{\mat{\Delta}}_k^{-1}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}
\end{gather*}
with mutually dependent solutions for the MLE estimates $\widehat{\mat{\alpha}}_j$ given by
\begin{displaymath}
\widehat{\mat{\alpha}}_j = \widehat{\mat{\Delta}}_j((\ten{X}-\widehat{\ten{\mu}})\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}[(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\Delta}}_k^{-1}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}]^{-1},
\end{displaymath}
for $j = 1, ..., r$.
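For reference, the corresponding fixed point for the covariance components, obtained from the $\d{\mat{\Delta}}_j$ part of the differential above (a sketch, stated here for completeness; it is consistent with the $r = 1$ example below and with the \texttt{kpir.mle} implementation added in this commit), is
\begin{displaymath}
\widehat{\mat{\Delta}}_j = \frac{1}{n p_{\lnot j}}(\ten{R}\times_{k\in[r]\backslash j}\widehat{\mat{\Delta}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}}, \qquad j = 1, ..., r.
\end{displaymath}
Since the $\widehat{\mat{\alpha}}_j$ and $\widehat{\mat{\Delta}}_j$ depend on one another, both sets of estimates are obtained by alternating over these updates.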
\begin{example}[Simple multivariate case ($r = 1$)]
Let us consider the case $r = 1$, which is the usual multivariate regression setting with vector valued predictors. Here $\mat{F}$ denotes the $n\times q$ centered model matrix and the responses are collected in the $n\times p$ matrix $\mat{X}$ with the $n$ observations in the rows and the variables in the columns. Furthermore, let the residual matrix be $\mat{R} = \mat{X} - \mat{F}\t{\widehat{\mat{\alpha}}_1}$. In this case the tensor MLE estimates simplify to their well known form
\begin{align*}
\widehat{\mat{\alpha}}_1
&= \widehat{\mat{\Delta}}_1((\ten{X}-\widehat{\ten{\mu}})\times_{k\in[1]}{\widehat{\mat{\Delta}}}_k^{-1})_{(1)}\t{(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(1)}}[(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\Delta}}_k^{-1}\widehat{\mat{\alpha}}_k)_{(1)}\t{(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(1)}}]^{-1} \\
&= \widehat{\mat{\Delta}}_1\widehat{\mat{\Delta}}_1^{-1}((\ten{X}-\widehat{\ten{\mu}}))_{(1)}\t{\ten{F}_{(1)}}[\ten{F}_{(1)}\t{\ten{F}_{(1)}}]^{-1} \\
&= \t{\mat{X}}\mat{F}(\t{\mat{F}}\mat{F})^{-1}
\end{align*}
as well as the covariance estimate
\begin{align*}
\widehat{\mat{\Delta}}_1
&= \frac{1}{n}(\ten{R}\times_{k\in\emptyset}{\widehat{\mat{\Delta}}}_k^{-1})_{(1)}\t{\ten{R}_{(1)}} = \frac{1}{n}\t{\mat{R}}\mat{R}
= \frac{1}{n}\t{(\mat{X} - \mat{F}\t{\widehat{\mat{\alpha}}_1})}(\mat{X} - \mat{F}\t{\widehat{\mat{\alpha}}_1}).
\end{align*}
Note that $\ten{F}_{(1)} = \t{\mat{F}}$, $\ten{X}_{(1)} = \t{\mat{X}}$ and $\ten{R}_{(1)} = \t{\mat{R}}$.
\end{example}
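As a quick numerical sanity check of this $r = 1$ reduction (a plain R sketch, not part of the package; all names and values are illustrative), the closed form agrees with the standard least squares fit:

n <- 100; p <- 4; q <- 2
Fm <- scale(matrix(rnorm(n * q), n, q), scale = FALSE)  # centered model matrix
alpha <- matrix(rnorm(p * q), p, q)
X <- Fm %*% t(alpha) + matrix(rnorm(n * p), n, p)       # r = 1 model with Delta_1 = I_p
alpha.hat <- t(X) %*% Fm %*% solve(crossprod(Fm))       # t(X) F (t(F) F)^-1
R <- X - Fm %*% t(alpha.hat)
Delta.hat <- crossprod(R) / n                           # (1/n) t(R) R
stopifnot(all.equal(unname(alpha.hat), unname(t(coef(lm(X ~ Fm - 1))))))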
\paragraph{Comparison to the general case:} {\color{red} There are two main differences. First, the general case has a closed form solution for the gradient due to the explicit nature of the MLE estimate of $\widehat{\mat\Delta}$, compared to the mutually dependent MLE estimates $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$. On the other hand, the general case has dramatically bigger dimensions of the covariance matrix ($p q \times p q$) compared to the two Kronecker components with dimensions $q \times q$ and $p \times p$. This means that in the general case there is a huge performance penalty in the dimensions of $\widehat{\mat\Delta}$, while in the other case an extra estimation is required to determine $\widehat{\mat\Delta}_1$ and $\widehat{\mat\Delta}_2$.}
@ -508,11 +537,11 @@ and therefore the gradients
\section{Thoughts on initial value estimation}
\todo{This section uses an alternative notation as it already tries to generalize to arbitrary multi-dimensional arrays. Furthermore, one of the main differences is that the observations are indexed in the \emph{last} mode. The benefit of this is that the mode product and parameter matrix indices match not only in the population model but also in sample versions.}
Let $\ten{X}_i, \ten{F}_i$ be order $r$ tensors of dimensions $p_1\times ... \times p_r$ and $q_1\times ... \times q_r$, respectively. Also let the error tensor $\ten{\epsilon}_i$ be of the same order and dimensions as $\ten{X}_i$. The considered model for the $i$'th observation is
\begin{displaymath}
\ten{X}_i = \ten{\mu} + \ten{F}_i\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon}_i
\end{displaymath}
where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distributed $\ten{\epsilon}_i\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for $\mat{\Delta}_j\in\mathcal{S}^{p_j}_{++}$, $j = 1, ..., r$. Given $i = 1, ..., n$ observations, the collected model containing all observations is
\begin{displaymath}
\ten{X} = \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon}
\end{displaymath}

View File

@ -18,6 +18,7 @@ export(hopca)
export(kpir.approx)
export(kpir.base)
export(kpir.ls)
export(kpir.mle)
export(kpir.momentum)
export(kpir.new)
export(mat)

View File

@ -2,11 +2,11 @@
#'
#' This is a special case of the "Khatri-Rao Product".
#'
#' @seealso \code{\link{rowKronecker}}
#'
#' @export
colKronecker <- function(A, B) {
sapply(seq_len(ncol(A)), function(i) kronecker(A[, i], B[, i]))
}
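A quick sketch of what this computes (illustrative values only): column i of the result equals kronecker(A[, i], B[, i]), so for conforming matrices

A <- matrix(1:6, 2, 3)
B <- matrix(1:9, 3, 3)
C <- colKronecker(A, B)                    # (2 * 3) x 3 matrix
stopifnot(all.equal(C[, 2], kronecker(A[, 2], B[, 2])))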
#' Row wise kronecker product
@ -14,7 +14,7 @@ colKronecker <- function(A, B) {
#' Also known as the "Face-splitting product". This is a special case of the
#' "Khatri-Rao Product".
#'
#' @seealso \code{\link{colKronecker}}
#'
#' @export
rowKronecker <- function(A, B) {

View File

@ -161,7 +161,7 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
# backtracking loop
for (delta in step.size * 0.618034^seq.int(0L, length = max.line.iter)) {
# Update `alpha` and `beta` (note: add(+), the gradients are already
# pointing into the negative slope direction of the loss because they are
# the gradients of the log-likelihood [NOT the negative log-likelihood])
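The backtracking factor 0.618034 is the reciprocal golden ratio, so every rejected step shrinks the step size by roughly 38%; a small sketch of the step sizes tried (values are illustrative, not from the package):

step.size <- 0.01; max.line.iter <- 5L
step.size * 0.618034^seq.int(0L, length.out = max.line.iter)
# approximately 0.01000 0.00618 0.00382 0.00236 0.00146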

View File

@ -14,8 +14,6 @@ kpir.ls <- function(X, Fy, max.iter = 20L, sample.axis = 1L,
dims[sample.axis] <- length(Fy)
dims
})
} else {
stopifnot(dim(X)[sample.axis] == dim(Fy)[sample.axis])
}
# Check dimensions
stopifnot(length(dim(X)) == length(dim(Fy)))
@ -60,8 +58,6 @@ kpir.ls <- function(X, Fy, max.iter = 20L, sample.axis = 1L,
Deltas <- Map(mcrossprod, list(R), mode = modes)
Deltas <- Map(`*`, 1 / dim(X)[sample.axis], Deltas)
list(alphas = structure(alphas, names = as.character(modes)),
Deltas = structure(Deltas, names = as.character(modes)))
}

View File

@ -0,0 +1,90 @@
#' Per mode (axis) MLE
#'
#' @export
kpir.mle <- function(X, Fy, max.iter = 500L, sample.axis = 1L,
logger = NULL #, eps = .Machine$double.eps
) {
### Step 0: Setup/Initialization
if (!is.array(Fy)) {
# scalar response case (add new axis of size 1)
dim(Fy) <- ifelse(seq_along(dim(X)) == sample.axis, dim(X)[sample.axis], 1L)
}
# Check dimensions and matching of axis (tensor order)
stopifnot(exprs = {
length(dim(X)) == length(dim(Fy))
dim(X)[sample.axis] == dim(Fy)[sample.axis]
})
# warn about model constraints
if (any(dim(Fy)[-sample.axis] >= dim(X)[-sample.axis])) {
warning("Degenerate case 'any(dim(Fy)[-sample.axis] >= dim(X)[-sample.axis])'")
}
# extract dimensions (for easier handling as local variables)
modes <- seq_along(dim(X))[-sample.axis] # predictor axis indices
n <- dim(X)[sample.axis] # observation count (scalar)
p <- dim(X)[-sample.axis] # predictor dimensions (vector)
# q <- dim(Fy)[-sample.axis] # response dimensions (vector)
# r <- length(dim(X)) - 1L # tensor order (scalar)
# Means for X and Fy (a.k.a. sum elements over the sample axis)
meanX <- apply(X, modes, mean, simplify = TRUE)
meanFy <- apply(Fy, modes, mean, simplify = TRUE)
# Center both X and Fy
X <- sweep(X, modes, meanX)
Fy <- sweep(Fy, modes, meanFy)
### Step 1: Initial value estimation
alphas <- Map(function(mode, ncol) {
La.svd(mcrossprod(X, mode = mode), ncol)$u
}, modes, dim(Fy)[modes])
# Residuals
R <- X - mlm(Fy, alphas, modes = modes)
# Covariance Moment estimates
Deltas <- Map(mcrossprod, list(R), mode = modes)
Deltas <- Map(function(Delta, j) (n * prod(p[-j]))^(-1) * Delta,
Deltas, seq_along(Deltas))
# Call history callback (logger) before the first iteration
if (is.function(logger)) { do.call(logger, c(0L, NA, alphas, Deltas)) }
### Step 2: Alternating estimate updates
for (iter in seq_len(max.iter)) {
# Compute covariance inverses
Deltas.inv <- Map(solve, Deltas)
# "standardize" X
Z <- mlm(X, Deltas.inv, modes = modes)
# Compute new alpha estimates
alphas <- Map(function(j) {
# MLE estimate for alpha_j | alpha_k, Delta_l for all k != j and l
FF <- mlm(Fy, alphas[-j], modes = modes[-j])
Deltas[[j]] %*% t(solve(
t(mcrossprod(mlm(FF, Deltas.inv[-j], modes = modes[-j]), FF, mode = modes[j])),
t(mcrossprod(Z, FF, mode = modes[j]))))
}, seq_along(modes))
# update residuals
R <- X - mlm(Fy, alphas, modes = modes)
# next Delta estimates
Deltas <- Map(function(j) {
# MLE estimate for Delta_j | Delta_k, alpha_l for all k != j and l
(n * prod(p[-j]))^(-1) * mcrossprod(
mlm(R, Deltas.inv[-j], modes = modes[-j]), R, mode = modes[j])
}, seq_along(modes))
# TODO: break condition!!!
# Call history callback
if (is.function(logger)) { do.call(logger, c(iter, NA, alphas, Deltas)) }
}
list(alphas = structure(alphas, names = as.character(modes)),
Deltas = structure(Deltas, names = as.character(modes)),
meanX = meanX, meanFy = meanFy)
}
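A hedged usage sketch for kpir.mle (assumes the package is loaded so mlm and mcrossprod are available; dimensions and data are illustrative only):

n <- 100; p <- c(3, 4); q <- c(2, 2)
alphas <- Map(function(pj, qj) matrix(rnorm(pj * qj), pj, qj), p, q)
Fy <- array(rnorm(n * prod(q)), dim = c(n, q))     # sample axis first (the default)
X <- mlm(Fy, alphas, modes = 2:3) + array(rnorm(n * prod(p)), dim = c(n, p))
fit <- kpir.mle(X, Fy, max.iter = 50L, sample.axis = 1L)
str(fit$alphas)   # per-mode coefficient estimates, named by mode ("2", "3")
str(fit$Deltas)   # per-mode covariance estimates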

View File

@ -137,7 +137,7 @@ kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
# backtracking loop
for (delta in step.size * 0.618034^seq.int(0L, length = max.line.iter)) {
# Update `alpha` and `beta` (note: add(+), the gradients are already
# pointing into the negative slope direction of the loss because they are
# the gradients of the log-likelihood [NOT the negative log-likelihood])

View File

@ -112,7 +112,7 @@ kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
# Line Search loop
for (delta in step.size * 0.618034^seq.int(0L, length = max.line.iter)) {
# Update `alpha` and `beta` (note: add(+), the gradients are already
# pointing into the negative slope direction of the loss because they are
# the gradients of the log-likelihood [NOT the negative log-likelihood])

View File

@ -14,8 +14,41 @@
#' @note equivalent to \code{tcrossprod(mat(A, mode))} with around the same
#' performance but only allocates the result matrix.
#'
#' @examples
#' dimA <- c(2, 5, 7, 11)
#' A <- array(rnorm(prod(dimA)), dim = dimA)
#' stopifnot(exprs = {
#' all.equal(mcrossprod(A, mode = 1), tcrossprod(mat(A, 1)))
#' all.equal(mcrossprod(A, , 2), tcrossprod(mat(A, 2)))
#' all.equal(mcrossprod(A, , mode = 3), tcrossprod(mat(A, 3)))
#' all.equal(mcrossprod(A, A, 4), tcrossprod(mat(A, 4)))
#' })
#'
#' dimA <- c(2, 5, 7, 11)
#' dimB <- c(3, 5, 7, 11) # same as dimA except the 1st entry
#' A <- array(rnorm(prod(dimA)), dim = dimA)
#' B <- array(rnorm(prod(dimB)), dim = dimB)
#' stopifnot(all.equal(
#' mcrossprod(A, B, 1),
#' tcrossprod(mat(A, 1), mat(B, 1))
#' ))
#'
#' dimA <- c(5, 2, 7, 11)
#' dimB <- c(5, 3, 7, 11) # same as dimA except the 2nd entry
#' A <- array(rnorm(prod(dimA)), dim = dimA)
#' B <- array(rnorm(prod(dimB)), dim = dimB)
#' stopifnot(all.equal(
#' mcrossprod(A, B, mode = 2L),
#' tcrossprod(mat(A, 2), mat(B, 2))
#' ))
#'
#' @export
mcrossprod <- function(A, B, mode = length(dim(A))) {
storage.mode(A) <- "double"
if (missing(B)) {
.Call("C_mcrossprod_sym", A, as.integer(mode))
} else {
storage.mode(B) <- "double"
.Call("C_mcrossprod", A, B, as.integer(mode))
}
}
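Since B is optional, the wrapper dispatches to the symmetric kernel when B is missing; a small sketch checking that both code paths agree for B = A (illustrative dimensions):

A <- array(rnorm(2 * 3 * 5), dim = c(2, 3, 5))
stopifnot(all.equal(mcrossprod(A, mode = 2), mcrossprod(A, A, mode = 2)))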

View File

@ -23,7 +23,7 @@
#' stopifnot(all.equal(C1, C4))
#'
#' # selected modes
#' C1 <- mlm(A, B[2:3], modes = 2:3)
#' C2 <- mlm(A, B[[2]], B[[3]], modes = 2:3)
#' C3 <- ttm(ttm(A, B[[2]], 2), B[[3]], 3)
#' stopifnot(all.equal(C1, C2))
@ -32,7 +32,7 @@
#' # analog to matrix multiplication
#' A <- matrix(rnorm( 6), 2, 3)
#' B <- matrix(rnorm(12), 3, 4)
#' C <- matrix(rnorm(20), 5, 4)
#' stopifnot(all.equal(
#' A %*% B %*% t(C),
#' mlm(B, list(A, C))
@ -56,6 +56,16 @@
#' stopifnot(all.equal(P1, P2))
#' stopifnot(all.equal(P1, P3))
#'
#' # Concatenation of MLMs is an MLM
#' dimX <- c(4, 8, 6, 3)
#' dimA <- c(3, 17, 19, 2)
#' dimB <- c(7, 11, 13, 5)
#' X <- array(rnorm(prod(dimX)), dim = dimX)
#' As <- Map(function(p, q) matrix(rnorm(p * q), p, q), dimA, dimX)
#' Bs <- Map(function(p, q) matrix(rnorm(p * q), p, q), dimB, dimA)
#' # (X x {A1, A2, A3, A4}) x {B1, B2, B3, B4} = X x {B1 A1, B2 A2, B3 A3, B4 A4}
#' all.equal(mlm(mlm(X, As), Bs), mlm(X, Map(`%*%`, Bs, As)))
#'
#' @export
mlm <- function(A, B, ..., modes = seq_along(B)) {
# Collect all matrices in `B`

View File

@ -1,78 +0,0 @@
#' Kronecker decomposed Variance Matrix estimation.
#'
#' @description Computes the kronecker decomposition factors of the variance
#' matrix
#' \deqn{\var{X} = tr(L)tr(R) (L\otimes R).}
#'
#' @param X numeric matrix or 3d array.
#' @param shape in case of \code{X} being a matrix, this specifies the predictor
#' shape, otherwise ignored.
#' @param center boolean specifying if \code{X} is centered before computing the
#' left/right second moments. This is useful in the case of already centered
#' data.
#'
#' @returns List containing
#' \describe{
#' \item{lhs}{Left Hand Side \eqn{L} of the kronecker decomposed variance matrix}
#' \item{rhs}{Right Hand Side \eqn{R} of the kronecker decomposed variance matrix}
#' \item{trace}{Scaling factor \eqn{tr(L)tr(R)} for the variance matrix}
#' }
#'
#' @examples
#' n <- 503L # nr. of observations
#' p <- 32L # first predictor dimension
#' q <- 27L # second predictor dimension
#' lhs <- 0.5^abs(outer(seq_len(q), seq_len(q), `-`)) # Left Var components
#' rhs <- 0.5^abs(outer(seq_len(p), seq_len(p), `-`)) # Right Var components
#' X <- rmvnorm(n, sigma = kronecker(lhs, rhs)) # Multivariate normal data
#'
#' # Estimate kronecker decomposed variance matrix
#' dim(X) # c(n, p * q)
#' fit <- var.kronecker(X, shape = c(p, q))
#'
#' # equivalent
#' dim(X) <- c(n, p, q)
#' fit <- var.kronecker(X)
#'
#' # Compute complete estimated variance matrix
#' varX.hat <- fit$trace^-1 * kronecker(fit$lhs, fit$rhs)
#'
#' # or its inverse
#' varXinv.hat <- fit$trace * kronecker(solve(fit$lhs), solve(fit$rhs))
#'
var.kronecker <- function(X, shape = dim(X)[-1], center = TRUE) {
# Get and check predictor dimensions
n <- nrow(X)
if (length(dim(X)) == 2L) {
stopifnot(ncol(X) == prod(shape[1:2]))
p <- as.integer(shape[1]) # Predictor "height"
q <- as.integer(shape[2]) # Predictor "width"
dim(X) <- c(n, p, q)
} else if (length(dim(X)) == 3L) {
p <- dim(X)[2]
q <- dim(X)[3]
} else {
stop("'X' must be a matrix or 3-tensor")
}
if (isTRUE(center)) {
# Center X; X[, i, j] <- X[, i, j] - mean(X[, i, j])
X <- scale(X, scale = FALSE)
print(range(attr(X, "scaled:center")))
dim(X) <- c(n, p, q)
}
# Calc left/right side of kronecker structures covariance
# var(X) = var.lhs %x% var.rhs
var.lhs <- .rowMeans(apply(X, 1, crossprod), q * q, n)
dim(var.lhs) <- c(q, q)
var.rhs <- .rowMeans(apply(X, 1, tcrossprod), p * p, n)
dim(var.rhs) <- c(p, p)
# Estimate scaling factor tr(var(X)) = tr(var.lhs) tr(var.rhs)
trace <- sum(X^2) / n
list(lhs = var.lhs, rhs = var.rhs, trace = trace)
}

View File

@ -9,14 +9,17 @@
/* Tensor Times Matrix a.k.a. Mode Product */
extern SEXP ttm(SEXP A, SEXP X, SEXP mode);
/* Tensor Mode Crossproduct `A_(m) B_(m)^T` */
extern SEXP mcrossprod(SEXP A, SEXP B, SEXP mode);
/* Symmetric Tensor Mode Crossproduct `A_(m) A_(m)^T` */
extern SEXP mcrossprod_sym(SEXP A, SEXP mode);
/* List of registered routines (a.k.a. C entry points) */
static const R_CallMethodDef CallEntries[] = {
// {"FastPOI_C_sub", (DL_FUNC) &FastPOI_C_sub, 5}, // NOT USED
{"C_ttm", (DL_FUNC) &ttm, 3},
{"C_mcrossprod", (DL_FUNC) &mcrossprod, 3},
{"C_mcrossprod_sym", (DL_FUNC) &mcrossprod_sym, 2},
{NULL, NULL, 0}
};

View File

@ -11,16 +11,100 @@
/**
* Tensor Mode Crossproduct
*
* C = A_(m) t(B_(m))
*
* For matrices `A` and `B`, the first mode `mcrossprod(A, B, 1)` is equivalent
* to `A %*% t(B)` (`tcrossprod`). On the other hand for mode two
* `mcrossprod(A, B, 2)` the equivalence is `t(A) %*% B` (`crossprod`).
*
* @param A multi-dimensional array
* @param B multi-dimensional array
* @param m mode index (1-indexed)
*/
extern SEXP mcrossprod(SEXP A, SEXP B, SEXP m) {
// get zero indexed mode
int mode = asInteger(m) - 1;
// get dimension attributes
SEXP dimA = getAttrib(A, R_DimSymbol);
SEXP dimB = getAttrib(B, R_DimSymbol);
// validate mode (0-indexed, must be smaller than the tensor order)
if (mode < 0 || length(dimA) <= mode || length(dimB) <= mode) {
error("Illegal mode");
}
// the strides
// `stride[0] <- prod(dim(A)[seq_len(mode - 1)])`
// `stride[1] <- dim(A)[mode]`
// `stride[2] <- prod(dim(A)[-seq_len(mode)])`
// Note: Middle stride is ignored (to be consistent with sym version)
int stride[3] = {1, 0, 1};
for (int i = 0; i < length(dimA); ++i) {
int size = INTEGER(dimA)[i];
stride[0] *= (i < mode) ? size : 1;
stride[2] *= (i > mode) ? size : 1;
// check margin matching of `A` and `B`
if (i != mode && size != INTEGER(dimB)[i]) {
error("Dimension mismatch");
}
}
// Result dimensions
int nrowC = INTEGER(dimA)[mode];
int ncolC = INTEGER(dimB)[mode];
// create response matrix C
SEXP C = PROTECT(allocMatrix(REALSXP, nrowC, ncolC));
// raw data access pointers
double* a = REAL(A);
double* b = REAL(B);
double* c = REAL(C);
// employ BLAS dgemm (Double precision GEneral Matrix-Matrix multiply) operation
// (C = alpha op(A) op(B) + beta C, where op is either the identity or transposition)
const double zero = 0.0;
const double one = 1.0;
if (mode == 0) {
// mode 1: special case C = A_(1) B_(1)^T
// C = 1 A B^T + 0 C
F77_CALL(dgemm)("N", "T", &nrowC, &ncolC, &stride[2],
&one, a, &nrowC, b, &ncolC,
&zero, c, &nrowC FCONE FCONE);
} else {
// Other modes written as accumulated sum of matrix products
// initialize C to zero
memset(c, 0, nrowC * ncolC * sizeof(double));
// Sum over all modes > mode
for (int i2 = 0; i2 < stride[2]; ++i2) {
// C = 1 A^T B + 1 C
F77_CALL(dgemm)("T", "N", &nrowC, &ncolC, &stride[0],
&one, &a[i2 * stride[0] * nrowC], &stride[0],
&b[i2 * stride[0] * ncolC], &stride[0],
&one, c, &nrowC FCONE FCONE);
}
}
// release C to the garbage collector
UNPROTECT(1);
return C;
}
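For modes other than the first, the kernel accumulates slab-wise products; a hedged R analogue of that accumulation for a 3-way array and mode 2 (illustration only, using the package's mat() unfolding just for the final check):

dimA <- c(4, 3, 6); dimB <- c(4, 5, 6)
A <- array(rnorm(prod(dimA)), dimA)
B <- array(rnorm(prod(dimB)), dimB)
C <- matrix(0, dimA[2], dimB[2])
for (i2 in seq_len(dimA[3])) {
  C <- C + crossprod(A[, , i2], B[, , i2])   # t(A slab) %*% B slab
}
stopifnot(all.equal(C, tcrossprod(mat(A, 2), mat(B, 2))))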
/**
* Symmetric Tensor Mode Crossproduct
*
* C = A_(m) t(A_(m))
*
* For a matrix `A`, the first mode `mcrossprod_sym(A, 1)` is equivalent to
* `A %*% t(A)` (`tcrossprod`). On the other hand for mode two
* `mcrossprod_sym(A, 2)` the equivalence is `t(A) %*% A` (`crossprod`).
*
* @param A multi-dimensional array
* @param m mode index (1-indexed)
*/
extern SEXP mcrossprod_sym(SEXP A, SEXP m) {
// get zero indexed mode
int mode = asInteger(m) - 1;