diff --git a/.gitignore b/.gitignore
index 950bc93..5768bbd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -95,3 +95,12 @@ wip/
 # PDFs
 *.pdf
+
+# LaTeX (ignore everything except *.tex and *.bib files)
+**/LaTeX/*
+!**/LaTeX/*.tex
+!**/LaTeX/*.bib
+**/LaTeX/*-blx.bib
+
+mlda_analysis/
+References/
diff --git a/LaTeX/main.tex b/LaTeX/main.tex
index 98b396f..d0bda6c 100644
--- a/LaTeX/main.tex
+++ b/LaTeX/main.tex
@@ -87,7 +87,7 @@
 We start with a brief summary of the used notation.
 \todo{write this}
 
-Let $\ten{A}$ be a order (rank) $r$ tensor of dimensions $p_1\times ... \times p_r$ and the matrices $\mat{B}_i$ of dimensions $q_i\times p_i$ for $i = 1, ..., r$, then
+Let $\ten{A}$ be a multi-dimensional array of order (rank) $r$ with dimensions $p_1\times ... \times p_r$ and let $\mat{B}_i$ be matrices of dimensions $q_i\times p_i$ for $i = 1, ..., r$, then
 \begin{displaymath}
     \ten{A} \ttm[1] \mat{B}_1 \ttm[2] \ldots \ttm[r] \mat{B}_r
     = \ten{A}\times\{ \mat{B}_1, ..., \mat{B}_r \}
@@ -110,6 +110,59 @@
 Another example
 \todo{continue}
 
+\section{Tensor Normal Distribution}
+Let $\ten{X}$ be a multi-dimensional array random variable of order (rank) $r$ with dimensions $p_1\times ... \times p_r$, written as
+\begin{displaymath}
+    \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
+\end{displaymath}
+Its density is given by
+\begin{displaymath}
+    f(\ten{X}) = (2\pi)^{-p/2}\Big( \prod_{i = 1}^r |\mat{\Delta}_i|^{q_i} \Big)^{-1/2}
+    \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
+\end{displaymath}
+where $p = \prod_{i = 1}^r p_i$ and $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
+\begin{displaymath}
+    \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1).
+\end{displaymath}
+
+\begin{theorem}[Tensor Normal to Multi-Variate Normal equivalence]
+    Let $\ten{X}$ be a multi-dimensional random variable of order $r$ with dimensions $p_1\times ... \times p_r$, let $\ten{\mu}$ be the mean of the same order and dimensions as $\ten{X}$, and let the mode covariance matrices $\mat{\Delta}_i$ be of dimensions $p_i\times p_i$ for $i = 1, ..., r$. Then the tensor normal distribution is equivalent to the multi-variate normal distribution by the relation
+    \begin{displaymath}
+        \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r)
+        \qquad\Leftrightarrow\qquad
+        \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes ...\otimes \mat{\Delta}_1)
+    \end{displaymath}
+    where $p = \prod_{i = 1}^r p_i$.
+\end{theorem}
+\begin{proof}
+    A straightforward way is to rewrite the Tensor Normal density as the density of a Multi-Variate Normal distribution depending on the vectorization of $\ten{X}$. Using the identity $\vec(\ten{A}\times\{\mat{B}_1, ..., \mat{B}_r\}) = (\mat{B}_r\otimes ...\otimes\mat{B}_1)\vec{\ten{A}}$, first consider
+    \begin{align*}
+        \langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle
+        &= \t{\vec(\ten{X} - \mu)}\vec((\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\}) \\
+        &= \t{\vec(\ten{X} - \mu)}(\mat{\Delta}_r^{-1}\otimes ...\otimes\mat{\Delta}_1^{-1})\vec(\ten{X} - \mu) \\
+        &= \t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu).
+    \end{align*}
+    Next, applying the determinant property of the Kronecker product, $|\mat{\Delta}_1\otimes\mat{\Delta}_2| = |\mat{\Delta}_1|^{p_2}|\mat{\Delta}_2|^{p_1}$ for matrices of dimensions $p_1\times p_1$ and $p_2\times p_2$, yields
+    \begin{displaymath}
+        |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
+        = |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_2|^{p_1}|\mat{\Delta}_1|^{q_1}
+    \end{displaymath}
+    where $q_i = \prod_{j \neq i}p_j$. By induction over $r$, the relation
+    \begin{displaymath}
+        |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
+        = \prod_{i = 1}^r |\mat{\Delta}_i|^{q_i}
+    \end{displaymath}
+    holds for arbitrary order $r$. Substituting into the Tensor Normal density leads to
+    \begin{displaymath}
+        f(\ten{X}) = \Big( (2\pi)^p |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| \Big)^{-1/2}
+        \exp\!\left( -\frac{1}{2}\t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu) \right)
+    \end{displaymath}
+    which is the Multi-Variate Normal density of the $p$-dimensional vector $\vec\ten{X}$.
+\end{proof}
+
 \section{Introduction}
 
 We assume the model
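# --- Not part of the patch: a quick numerical check (base R only, a sketch
# --- with arbitrary dimensions) of the two identities used in the proof
# --- above, i.e. the Kronecker determinant property and the vectorization
# --- of the quadratic form for the order-2 case.
p1 <- 3; p2 <- 4
Delta.1 <- 0.5^abs(outer(1:p1, 1:p1, `-`))
Delta.2 <- 0.5^abs(outer(1:p2, 1:p2, `-`))

# |Delta.2 %x% Delta.1| == |Delta.1|^p2 * |Delta.2|^p1
all.equal(det(kronecker(Delta.2, Delta.1)),
          det(Delta.1)^p2 * det(Delta.2)^p1)

# <X - mu, (X - mu) x {Delta.1^-1, Delta.2^-1}> as a quadratic form in vec(X)
X <- matrix(rnorm(p1 * p2), p1, p2)  # an order-2 tensor with mu = 0
all.equal(sum(X * (solve(Delta.1) %*% X %*% solve(Delta.2))),
          drop(crossprod(c(X), solve(kronecker(Delta.2, Delta.1)) %*% c(X))))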
diff --git a/simulations/kpir_sim.R b/simulations/kpir_sim.R
index e15de28..4eca504 100644
--- a/simulations/kpir_sim.R
+++ b/simulations/kpir_sim.R
@@ -13,7 +13,7 @@ log.prog <- function(max.iter) {
 
 ### Exec all methods for a given data set and collect logs ###
-sim <- function(X, Fy, shape, alpha.true, beta.true, max.iter = 500L) {
+sim <- function(X, Fy, alpha.true, beta.true, max.iter = 500L) {
 
     # Logger creator
     logger <- function(name) {
@@ -26,7 +26,8 @@ sim <- function(X, Fy, shape, alpha.true, beta.true, max.iter = 500L) {
             dist.alpha = (dist.alpha <- dist.subspace(c(alpha.true), c(alpha))),
             dist.beta = (dist.beta <- dist.subspace(c( beta.true), c(beta ))),
             norm.alpha = norm(alpha, "F"),
-            norm.beta = norm(beta, "F")
+            norm.beta = norm(beta, "F"),
+            mse = mean((X - mlm(Fy, alpha, beta, modes = 3:2))^2)
         )
 
         cat(sprintf(
@@ -39,36 +40,36 @@ sim <- function(X, Fy, shape, alpha.true, beta.true, max.iter = 500L) {
     }
 
     # Initialize logger history targets
-    hist.base <- hist.new <- hist.momentum <- hist.approx <- # hist.kron <-
+    hist.base <- hist.new <- hist.momentum <- hist.approx <- hist.ls <-
         data.frame(iter = seq(0L, max.iter),
             loss = NA, dist = NA, dist.alpha = NA, dist.beta = NA,
-            norm.alpha = NA, norm.beta = NA
+            norm.alpha = NA, norm.beta = NA, mse = NA
         )
 
     # Base (old)
-    kpir.base(X, Fy, shape, max.iter = max.iter, logger = logger("base"))
+    kpir.base(X, Fy, max.iter = max.iter, logger = logger("base"))
 
     # New (simple Gradient Descent)
-    kpir.new(X, Fy, shape, max.iter = max.iter, logger = logger("new"))
+    kpir.new(X, Fy, max.iter = max.iter, logger = logger("new"))
+
+    # Least Squares estimate (alternating estimation)
+    kpir.ls(X, Fy, sample.mode = 1L, max.iter = max.iter, logger = logger("ls"))
 
     # Gradient Descent with Nesterov Momentum
-    kpir.momentum(X, Fy, shape, max.iter = max.iter, logger = logger("momentum"))
-
-    # # Residual Covariance Kronecker product assumpton version
-    # kpir.kron(X, Fy, shape, max.iter = max.iter, logger = logger("kron"))
+    kpir.momentum(X, Fy, max.iter = max.iter, logger = logger("momentum"))
 
     # Approximated MLE with Nesterov Momentum
-    kpir.approx(X, Fy, shape, max.iter = max.iter, logger = logger("approx"))
+    kpir.approx(X, Fy, max.iter = max.iter, logger = logger("approx"))
 
     # Add method tags
     hist.base$method <- factor("base")
     hist.new$method <- factor("new")
+    hist.ls$method <- factor("ls")
     hist.momentum$method <- factor("momentum")
-    # hist.kron$method <- factor("kron")
     hist.approx$method <- factor("approx")
 
     # Combine results and return
-    rbind(hist.base, hist.new, hist.momentum, hist.approx) #, hist.kron
+    rbind(hist.base, hist.new, hist.momentum, hist.approx, hist.ls)
 }
 
 ## Plot helper function
@@ -107,10 +108,10 @@ plot.hist2 <- function(hist, response, type = "all", ...) {
 
 ## Generate some test data / DEBUG
 n <- 200 # Sample Size
-p <- sample(1:15, 1) # 11
-q <- sample(1:15, 1) # 3
-k <- sample(1:15, 1) # 7
-r <- sample(1:15, 1) # 5
+p <- sample(2:15, 1) # 11
+q <- sample(2:15, 1) # 7
+k <- min(sample(1:15, 1), p - 1) # 3
+r <- min(sample(1:15, 1), q - 1) # 5
 print(c(n, p, q, k, r))
 
 hist <- NULL
@@ -129,8 +130,10 @@ for (rep in 1:reps) {
     ))
     Delta <- 0.5^abs(outer(seq_len(p * q), seq_len(p * q), `-`))
     X <- tcrossprod(Fy, kronecker(alpha, beta)) + CVarE:::rmvnorm(n, sigma = Delta)
+    dim(X) <- c(n, p, q)
+    dim(Fy) <- c(n, k, r)
 
-    hist.sim <- sim(X, Fy, shape = c(p, q, k, r), alpha.true, beta.true)
+    hist.sim <- sim(X, Fy, alpha.true, beta.true)
     hist.sim$repetition <- rep
 
     hist <- rbind(hist, hist.sim)
@@ -167,22 +170,22 @@ for (response in c("loss", "dist", "dist.alpha", "dist.beta")) {
 
 n <- 200 # Sample Size
 p <- 11 # sample(1:15, 1)
-q <- 3 # sample(1:15, 1)
-k <- 7 # sample(1:15, 1)
+q <- 7 # sample(1:15, 1)
+k <- 3 # sample(1:15, 1)
 r <- 5 # sample(1:15, 1)
 print(c(n, p, q, k, r))
 
 hist <- NULL
 reps <- 20
+max.iter <- 2
 
-Delta.1 <- sqrt(0.5)^abs(outer(seq_len(q), seq_len(q), `-`))
-Delta.2 <- sqrt(0.5)^abs(outer(seq_len(p), seq_len(p), `-`))
-Delta <- kronecker(Delta.1, Delta.2)
+Delta.1 <- sqrt(0.5)^abs(outer(seq_len(p), seq_len(p), `-`))
+Delta.2 <- sqrt(0.5)^abs(outer(seq_len(q), seq_len(q), `-`))
 
 for (rep in 1:reps) {
     cat(sprintf("%4d / %d simulation rep. started\n", rep, reps))
 
-    alpha.true <- alpha <- matrix(rnorm(q * r), q, r)
-    beta.true <- beta <- matrix(rnorm(p * k), p, k)
+    alpha.1.true <- alpha.1 <- matrix(rnorm(q * r), q, r)
+    alpha.2.true <- alpha.2 <- matrix(rnorm(p * k), p, k)
     y <- rnorm(n)
     Fy <- do.call(cbind, Map(function(slope, offset) {
         sin(slope * y + offset)
@@ -190,15 +193,16 @@ for (rep in 1:reps) {
         head(rep(seq(1, ceiling(0.5 * k * r)), each = 2), k * r),
         head(rep(c(0, pi / 2), ceiling(0.5 * k * r)), k * r)
     ))
-    X <- tcrossprod(Fy, kronecker(alpha, beta)) + CVarE:::rmvnorm(n, sigma = Delta)
+    dim(Fy) <- c(n, k, r)
+    X <- mlm(Fy, alpha.1, alpha.2, modes = 3:2)
+    X <- X + rtensornorm(n, 0, Delta.1, Delta.2, sample.mode = 1L)
 
-    hist.sim <- sim(X, Fy, shape = c(p, q, k, r), alpha.true, beta.true)
+    hist.sim <- sim(X, Fy, alpha.1.true, alpha.2.true, max.iter = max.iter)
     hist.sim$repetition <- rep
 
     hist <- rbind(hist, hist.sim)
 }
 
-
 # Save simulation results
 sim.name <- "sim02"
 datetime <- format(Sys.time(), "%Y%m%dT%H%M")
@@ -207,14 +211,77 @@ saveRDS(hist, file = sprintf("%s_%s.rds", sim.name, datetime))
 
 # for GGPlot2, as factors for grouping
 hist$repetition <- factor(hist$repetition)
-for (response in c("loss", "dist", "dist.alpha", "dist.beta")) {
+for (response in c("loss", "mse", "dist", "dist.alpha", "dist.beta")) {
     for (fun in c("all", "mean", "median")) {
         print(plot.hist2(hist, response, fun, title = fun) + coord_trans(x = "log1p"))
         dev.print(png, file = sprintf("%s_%s_%s_%s.png", sim.name, datetime, response, fun),
             width = 768, height = 768, res = 125)
+        if (response != "loss") {
+            print(plot.hist2(hist, response, fun, title = fun) + coord_trans(x = "log1p", y = "log1p"))
+            dev.print(png, file = sprintf("%s_%s_%s_%s_log.png", sim.name, datetime, response, fun),
+                width = 768, height = 768, res = 125)
+        }
     }
 }
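# --- Not part of the patch: sanity check that the `mlm` calls introduced
# --- above reproduce the previous Kronecker product formulation, i.e.
# --- vec(Fy x2 beta x3 alpha) = (alpha %x% beta) vec(Fy) per observation
# --- (the identity this patch relies on when replacing `tcrossprod`).
# --- A sketch with arbitrary dimensions; assumes `mlm` from tensorPredictors.
library(tensorPredictors)

k <- 3; r <- 2; p <- 5; q <- 4
Fy <- array(rnorm(k * r), dim = c(1, k, r))  # single observation
alpha <- matrix(rnorm(q * r), q, r)          # acts on mode 3
beta <- matrix(rnorm(p * k), p, k)           # acts on mode 2

all.equal(c(mlm(Fy, alpha, beta, modes = 3:2)),
          c(tcrossprod(matrix(Fy, 1), kronecker(alpha, beta))))
# expected TRUE up to floating point error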
+################################################################################
+###  Sim 3  ###
+################################################################################
+n <- 200
+p <- c(7, 11, 5)    # response dimensions (order 3)
+q <- c(3, 6, 2)     # predictor dimensions (order 3)
+
+# currently only kpir.ls supports higher orders (order > 2)
+sim3 <- function(X, Fy, alphas.true, max.iter = 500L) {
+
+    # Logger creator
+    logger <- function(name) {
+        eval(substitute(function(iter, loss, ...) {
+            # mode-wise estimates, passed mode-reversed by `kpir.ls`
+            alphas <- rev(list(...))
+            hist[iter + 1L, ] <<- c(
+                iter = iter,
+                loss = loss,
+                mse = (mse <- mean((X - mlm(Fy, alphas, modes = seq_along(dim(X))[-1]))^2)),
+                (dist <- unlist(Map(dist.subspace, alphas, alphas.true)))
+            )
+
+            cat(sprintf(
+                "%s(%3d) | loss: %-12.4f - mse: %-12.4f - sum(dist): %-.4e\n",
+                name, iter, loss, mse, sum(dist)
+            ))
+        }, list(hist = as.symbol(paste0("hist.", name)))))
+    }
+
+    # Initialize logger history targets
+    hist.ls <-
+        do.call(data.frame, c(list(
+            iter = seq(0L, max.iter), loss = NA, mse = NA),
+            dist = rep(NA, length(dim(X)) - 1L)
+        ))
+
+    # Least Squares estimate (alternating estimation)
+    kpir.ls(X, Fy, sample.mode = 1L, max.iter = max.iter, logger = logger("ls"))
+
+    # Add method tags
+    hist.ls$method <- factor("ls")
+
+    # # Combine results and return
+    # rbind(hist.base, hist.new, hist.momentum, hist.approx, hist.ls)
+    hist.ls
+}
+
+# Data generator for order-3 predictors (completed as a minimal sketch; the
+# error term assumes `rtensornorm` takes the mode covariances as consecutive
+# arguments, as used in Sim 2 above)
+sample.data3 <- function(n, p, q) {
+    stopifnot(length(p) == length(q))
+    stopifnot(all(q <= p))
+
+    # mode-wise AR(0.5) covariance matrices
+    Deltas <- Map(function(nrow) {
+        sqrt(0.5)^abs(outer(seq_len(nrow), seq_len(nrow), `-`))
+    }, p)
+
+    # true mode-wise reduction matrices
+    alphas <- Map(function(nrow, ncol) {
+        matrix(rnorm(nrow * ncol), nrow, ncol)
+    }, p, q)
+
+    # predictor tensor and response with tensor normal errors
+    Fy <- array(rnorm(n * prod(q)), dim = c(n, q))
+    X <- mlm(Fy, alphas, modes = seq_along(p) + 1L) +
+        do.call(rtensornorm, c(list(n, 0), Deltas, list(sample.mode = 1L)))
+
+    list(X = X, Fy = Fy, alphas = alphas, Deltas = Deltas)
+}
+
+
+
 ################################################################################
 ###  WIP  ###
 ################################################################################
diff --git a/simulations/simulation_poi.R b/simulations/simulation_poi.R
index 49c3873..7123f4c 100644
--- a/simulations/simulation_poi.R
+++ b/simulations/simulation_poi.R
@@ -1,3 +1,26 @@
+################################################################################
+###  Sparse SIR against SIR  ###
+################################################################################
+devtools::load_all('tensorPredictors/')
+library(dr)
+
+n <- 100
+p <- 10
+
+X <- rmvnorm(n, sigma = 0.5^abs(outer(1:p, 1:p, `-`)))
+y <- rowSums(X[, 1:3]) + rnorm(n, 0.5)
+B <- as.matrix(1:p <= 3) / sqrt(3)
+
+dr.sir <- dr(y ~ X, method = 'sir', numdir = ncol(B))
+B.sir <- dr.sir$evectors[, seq_len(ncol(B)), drop = FALSE]
+
+dist.projection(B, B.sir)
+
+B.poi <- with(GEP(X, y, 'sir'), {
+    POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = TRUE)$vectors
+})
+
+dist.projection(B, B.poi)
 
 ################################################################################
 ###  LDA (sparse Linear Discriminant Analysis)  ###
@@ -69,19 +92,19 @@ dataset <- function(nr) {
     list(X = X, y = y, B = qr.Q(qr(B)))
 }
 
-# # Model 1
-# fit <- with(dataset(1), {
-#     with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C'))
-# })
-# fit <- with(dataset(1), {
-#     with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C'))
-# })
-# fit <- with(dataset(1), {
-#     with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = TRUE))
-# })
-# fit <- with(dataset(1), {
-#     with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C', use.C = TRUE))
-# })
+# Model 1
+fit <- with(dataset(1), {
+    with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C'))
+})
+fit <- with(dataset(1), {
+    with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C'))
+})
+fit <- with(dataset(1), {
+    with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = TRUE))
+})
+fit <- with(dataset(1), {
+    with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C', use.C = TRUE))
+})
 
 # head(fit$vectors, 10)
 
@@ -90,19 +113,24 @@ nr.reps <- 100
 sim <- replicate(nr.reps, {
     res <- double(0)
     for (model.nr in 1:5) {
-        for (method in c('POI-C', 'FastPOI-C')) {
-            for (use.C in c(FALSE, TRUE)) {
-                dist <- with(dataset(model.nr), {
+        with(dataset(model.nr), {
+            for (method in c('POI-C', 'FastPOI-C')) {
+                for (use.C in c(FALSE, TRUE)) {
                     fit <- with(GEP(X, y, 'lda'), {
                         POI(lhs, rhs, ncol(B), method = method, use.C = use.C)
                     })
-                    # dist.subspace(B, fit$vectors, is.ortho = FALSE, normalize = TRUE)
-                    dist.projection(B, fit$vectors)
-                })
-                names(dist) <- paste('M', model.nr, '-', method, '-', use.C)
-                res <- c(res, dist)
+                    dist <- dist.projection(B, fit$vectors)
+                    names(dist) <- paste('M', model.nr, '-', method, '-', use.C)
+                    res <<- c(res, dist)
+                }
             }
-        }
+            fit <- with(GEP(X, y, 'lda'), {
+                solve.gep(lhs, rhs, ncol(B))
+            })
+            dist <- dist.projection(B, fit$vectors)
+            names(dist) <- paste('M', model.nr, '- solve')
+            res <<- c(res, dist)
+        })
     }
     cat("Counter", (count <<- count + 1), "/", nr.reps, "\n")
     res
diff --git a/tensorPredictors/NAMESPACE b/tensorPredictors/NAMESPACE
index b623699..739b05c 100644
--- a/tensorPredictors/NAMESPACE
+++ b/tensorPredictors/NAMESPACE
@@ -16,12 +16,13 @@ export(dist.subspace)
 export(kpir.approx)
 export(kpir.base)
-export(kpir.kron)
+export(kpir.ls)
 export(kpir.momentum)
 export(kpir.new)
 export(mat)
 export(matpow)
 export(matrixImage)
 export(mcrossprod)
+export(mlm)
 export(reduce)
 export(rowKronecker)
 export(rtensornorm)
diff --git a/tensorPredictors/R/kpir_kron.R b/tensorPredictors/R/kpir_kron.R
deleted file mode 100644
index b8be6db..0000000
--- a/tensorPredictors/R/kpir_kron.R
+++ /dev/null
@@ -1,216 +0,0 @@
-#' Gradient Descent Bases Tensor Predictors method with Nesterov Accelerated
-#' Momentum and Kronecker structure assumption for the residual covariance
-#' `Delta = Delta.1 %x% Delta.2` (simple plugin version!)
-#' -#' @export -kpir.kron <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), - max.iter = 500L, max.line.iter = 50L, step.size = 1e-3, - nesterov.scaling = function(a, t) { 0.5 * (1 + sqrt(1 + (2 * a)^2)) }, - eps = .Machine$double.eps, - logger = NULL -) { - - # Check if X and Fy have same number of observations - stopifnot(nrow(X) == NROW(Fy)) - n <- nrow(X) # Number of observations - - # Get and check predictor dimensions (convert to 3-tensor if needed) - if (length(dim(X)) == 2L) { - stopifnot(!missing(shape)) - stopifnot(ncol(X) == prod(shape[1:2])) - p <- as.integer(shape[1]) # Predictor "height" - q <- as.integer(shape[2]) # Predictor "width" - } else if (length(dim(X)) == 3L) { - p <- dim(X)[2] - q <- dim(X)[3] - } else { - stop("'X' must be a matrix or 3-tensor") - } - - # Get and check response dimensions (and convert to 3-tensor if needed) - if (!is.array(Fy)) { - Fy <- as.array(Fy) - } - if (length(dim(Fy)) == 1L) { - k <- r <- 1L - dim(Fy) <- c(n, 1L, 1L) - } else if (length(dim(Fy)) == 2L) { - stopifnot(!missing(shape)) - stopifnot(ncol(Fy) == prod(shape[3:4])) - k <- as.integer(shape[3]) # Response functional "height" - r <- as.integer(shape[4]) # Response functional "width" - } else if (length(dim(Fy)) == 3L) { - k <- dim(Fy)[2] - r <- dim(Fy)[3] - } else { - stop("'Fy' must be a vector, matrix or 3-tensor") - } - - - ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon` - # Vectorize - dim(Fy) <- c(n, k * r) - dim(X) <- c(n, p * q) - # Solve - cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace - if (n <= k * r || qr(cpFy)$rank < k * r) { - # In case of under-determined system replace the inverse in the normal - # equation by the Moore-Penrose Pseudo Inverse - B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X)) - } else { - # Compute OLS estimate by the Normal Equation - B <- t(solve(cpFy, crossprod(Fy, X))) - } - - # De-Vectroize (from now on tensor arithmetics) - dim(Fy) <- c(n, k, r) - dim(X) <- c(n, p, q) - - # Decompose `B = alpha x beta` into `alpha` and `beta` - c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k)) - - # Compute residuals - resid <- X - (Fy %x_3% alpha0 %x_2% beta0) - - # Covariance estimate - Delta.1 <- tcrossprod(mat(resid, 3)) - Delta.2 <- tcrossprod(mat(resid, 2)) - tr <- sum(diag(Delta.1)) - Delta.1 <- Delta.1 / sqrt(n * tr) - Delta.2 <- Delta.2 / sqrt(n * tr) - - # Transformed Residuals - resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2) - - # Evaluate negative log-likelihood - loss <- 0.5 * (n * (p * log(det(Delta.1)) + q * log(det(Delta.2))) + - sum(resid.trans * resid)) - - # Call history callback (logger) before the first iterate - if (is.function(logger)) { - logger(0L, loss, alpha0, beta0, Delta.1, Delta.2, NA) - } - - - ### Step 2: MLE with LS solution as starting value - a0 <- 0 - a1 <- 1 - alpha1 <- alpha0 - beta1 <- beta0 - - # main descent loop - no.nesterov <- TRUE - break.reason <- NA - for (iter in seq_len(max.iter)) { - if (no.nesterov) { - # without extrapolation as fallback - S.alpha <- alpha1 - S.beta <- beta1 - } else { - # extrapolation using previous direction - S.alpha <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0) - S.beta <- beta1 + ((a0 - 1) / a1) * ( beta1 - beta0) - } - - # Extrapolated Residuals - resid <- X - (Fy %x_3% S.alpha %x_2% S.beta) - - # Covariance Estimates - Delta.1 <- tcrossprod(mat(resid, 3)) - Delta.2 <- tcrossprod(mat(resid, 2)) - tr <- sum(diag(Delta.1)) - Delta.1 <- Delta.1 / sqrt(n * tr) - Delta.2 <- Delta.2 / sqrt(n * tr) - - # Transform Residuals - 
resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2) - - # Calculate Gradients - grad.alpha <- tcrossprod(mat(resid.trans, 3), mat(Fy %x_2% S.beta, 3)) - grad.beta <- tcrossprod(mat(resid.trans, 2), mat(Fy %x_3% S.alpha, 2)) - - # Backtracking line search (Armijo type) - # The `inner.prod` is used in the Armijo break condition but does not - # depend on the step size. - inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2) - - # backtracking loop - for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) { - # Update `alpha` and `beta` (note: add(+), the gradients are already - # pointing into the negative slope direction of the loss cause they are - # the gradients of the log-likelihood [NOT the negative log-likelihood]) - alpha.temp <- S.alpha + delta * grad.alpha - beta.temp <- S.beta + delta * grad.beta - - # Update Residuals, Covariance and transformed Residuals - resid <- X - (Fy %x_3% alpha.temp %x_2% beta.temp) - Delta.1 <- tcrossprod(mat(resid, 3)) - Delta.2 <- tcrossprod(mat(resid, 2)) - tr <- sum(diag(Delta.1)) - Delta.1 <- Delta.1 / sqrt(n * tr) - Delta.2 <- Delta.2 / sqrt(n * tr) - resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2) - - # Evaluate negative log-likelihood - loss.temp <- 0.5 * (n * (p * log(det(Delta.1)) + q * log(det(Delta.2))) - + sum(resid.trans * resid)) - - # Armijo line search break condition - if (loss.temp <= loss - 0.1 * delta * inner.prod) { - break - } - } - - # Call logger (invoke history callback) - if (is.function(logger)) { - logger(iter, loss.temp, alpha.temp, beta.temp, Delta.1, Delta.2, delta) - } - - # Ensure descent - if (loss.temp < loss) { - alpha0 <- alpha1 - alpha1 <- alpha.temp - beta0 <- beta1 - beta1 <- beta.temp - - # check break conditions (in descent case) - if (mean(abs(alpha1)) + mean(abs(beta1)) < eps) { - break.reason <- "alpha, beta numerically zero" - break # basically, estimates are zero -> stop - } - if (inner.prod < eps * (p * q + r * k)) { - break.reason <- "mean squared gradient is smaller than epsilon" - break # mean squared gradient is smaller than epsilon -> stop - } - if (abs(loss.temp - loss) < eps) { - break.reason <- "decrease is too small (slow)" - break # decrease is too small (slow) -> stop - } - - loss <- loss.temp - no.nesterov <- FALSE # always reset - } else if (!no.nesterov) { - no.nesterov <- TRUE # retry without momentum - next - } else { - break.reason <- "failed even without momentum" - break # failed even without momentum -> stop - } - - # update momentum scaling - a0 <- a1 - a1 <- nesterov.scaling(a1, iter) - - # Set next iter starting step.size to line searched step size - # (while allowing it to encrease) - step.size <- 1.618034 * delta - - } - - list( - loss = loss, - alpha = alpha1, beta = beta1, - Delta.1 = Delta.1, Delta.2 = Delta.2, - break.reason = break.reason - ) -} diff --git a/tensorPredictors/R/kpir_ls.R b/tensorPredictors/R/kpir_ls.R index 871e717..f4b755c 100644 --- a/tensorPredictors/R/kpir_ls.R +++ b/tensorPredictors/R/kpir_ls.R @@ -17,19 +17,28 @@ kpir.ls <- function(X, Fy, max.iter = 20L, sample.mode = 1L, } else { stopifnot(dim(X)[sample.mode] == dim(Fy)[sample.mode]) } - # and check shape - stopifnot(length(X) == length(Fy)) + # Check dimensions + stopifnot(length(dim(X)) == length(dim(Fy))) + stopifnot(dim(X)[sample.mode] == dim(Fy)[sample.mode]) + # and model constraints + stopifnot(all(dim(Fy) <= dim(X))) # mode index sequence (exclude sample mode, a.k.a. 
observation axis)
     modes <- seq_along(dim(X))[-sample.mode]
 
+
     ### Step 1: initial per mode estimates
     alphas <- Map(function(mode, ncol) {
-        La.svd(mcrossprod(X, mode), ncol)$u
+        La.svd(mcrossprod(X, mode = mode), ncol)$u
     }, modes, dim(Fy)[modes])
 
+    # # Scaling of alpha, such that `tr(alpha_i' alpha_i) = tr(alpha_j' alpha_j)`
+    # # for `i, j = 1, ..., r`.
+    # traces <- unlist(Map(function(alpha) sum(alpha^2), alphas))
+    # alphas <- Map(`*`, prod(traces)^(1 / length(alphas)) / traces, alphas)
+
     # Call history callback (logger) before the first iteration
-    if (is.function(logger)) { logger(0L, alphas) }
+    if (is.function(logger)) { do.call(logger, c(0L, NA, rev(alphas))) }
 
 
     ### Step 2: iterate per mode (axis) least squares estimates
@@ -38,16 +47,31 @@
         for (j in seq_along(modes)) {
             # least squares solution for `alpha_j | alpha_i, i != j`
             Z <- mlm(Fy, alphas[-j], modes = modes[-j])
-            alphas[[j]] <- t(solve(mcrossprod(Z, j), tcrossprod(mat(Z, j), mat(X, j))))
+            alphas[[j]] <- t(solve(mcrossprod(Z, mode = modes[j]),
+                tcrossprod(mat(Z, modes[j]), mat(X, modes[j]))))
             # TODO: alphas[[j]] <- t(solve(mcrossprod(Z, j), mcrossprod(Z, X, j)))
         }
 
+        # # Scaling of alpha, such that `tr(alpha_i' alpha_i) = tr(alpha_j' alpha_j)`
+        # # for `i, j = 1, ..., r`.
+        # traces <- unlist(Map(function(alpha) sum(alpha^2), alphas))
+        # alphas <- Map(`*`, prod(traces)^(1 / length(alphas)) / traces, alphas)
+
         # Call logger (invoke history callback)
-        if (is.function(logger)) { logger(iter, alphas) }
+        if (is.function(logger)) { do.call(logger, c(iter, NA, rev(alphas))) }
 
         # TODO: add some kind of break condition
     }
 
+    ### Step 3: Moment estimates for `Delta_i`
+    # Residuals
+    R <- X - mlm(Fy, alphas, modes = modes)
+    # Mode-wise moment estimates of the `Delta_i`s
+    Deltas <- Map(mcrossprod, list(R), mode = modes)
+    Deltas <- Map(`*`, 1 / dim(X)[sample.mode], Deltas)
+
+    list(
+        alphas = structure(alphas, names = as.character(modes)),
+        Deltas = structure(Deltas, names = as.character(modes))
+    )
 }
-
-
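# --- Not part of the patch: example usage of the updated `kpir.ls` return
# --- value. A sketch only; the dimensions are arbitrary, the data generation
# --- mirrors the simulations above (with i.i.d. noise for simplicity), and
# --- the `logger` argument is assumed to default to NULL.
library(tensorPredictors)

n <- 100
p <- c(7, 5)  # predictor dimensions
q <- c(3, 2)  # response functional dimensions
alphas.true <- Map(function(pi, qi) matrix(rnorm(pi * qi), pi, qi), p, q)

Fy <- array(rnorm(n * prod(q)), dim = c(n, q))
X <- mlm(Fy, alphas.true, modes = 2:3) +
    array(rnorm(n * prod(p)), dim = c(n, p))

fit <- kpir.ls(X, Fy, sample.mode = 1L, max.iter = 20L)
str(fit$alphas)  # mode-wise reductions, named by mode: "2", "3"
str(fit$Deltas)  # mode-wise residual covariance estimates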