#' Using unbiased (but not MLE) estimates for the Kronecker decomposed #' covariance matrices Delta_1, Delta_2 for approximating the log-likelihood #' giving a closed form solution for the gradient. #' #' Delta_1 = n^-1 sum_i R_i' R_i, #' Delta_2 = n^-1 sum_i R_i R_i'. #' #' @export kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), max.iter = 500L, max.line.iter = 50L, step.size = 1e-3, nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)), max.init.iter = 20L, init.method = c("ls", "vlp"), eps = .Machine$double.eps, logger = NULL ) { # Check if X and Fy have same number of observations stopifnot(nrow(X) == NROW(Fy)) n <- nrow(X) # Number of observations # Check predictor dimensions if (length(dim(X)) == 2L) { stopifnot(!missing(shape)) stopifnot(ncol(X) == prod(shape[1:2])) p <- as.integer(shape[1]) # Predictor "height" q <- as.integer(shape[2]) # Predictor "width" } else if (length(dim(X)) == 3L) { p <- dim(X)[2] q <- dim(X)[3] } else { stop("'X' must be a matrix or 3-tensor") } # Check response dimensions if (!is.array(Fy)) { Fy <- as.array(Fy) } if (length(dim(Fy)) == 1L) { k <- r <- 1L } else if (length(dim(Fy)) == 2L) { stopifnot(!missing(shape)) stopifnot(ncol(Fy) == prod(shape[3:4])) k <- as.integer(shape[3]) # Response functional "height" r <- as.integer(shape[4]) # Response functional "width" } else if (length(dim(Fy)) == 3L) { k <- dim(Fy)[2] r <- dim(Fy)[3] } else { stop("'Fy' must be a vector, matrix or 3-tensor") } ### Step 1: (Approx) Least Squares initial estimate init.method <- match.arg(init.method) if (init.method == "ls") { # De-Vectroize (from now on tensor arithmetics) dim(Fy) <- c(n, k, r) dim(X) <- c(n, p, q) ls <- kpir.ls(X, Fy, max.iter = max.init.iter, sample.axis = 1L, eps = eps) c(beta0, alpha0) %<-% ls$alphas } else { # Van Loan and Pitsianis # Vectorize dim(Fy) <- c(n, k * r) dim(X) <- c(n, p * q) # solution for `X = Fy B' + epsilon` cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace if (n <= k * r || qr(cpFy)$rank < k * r) { # In case of under-determined system replace the inverse in the normal # equation by the Moore-Penrose Pseudo Inverse B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X)) } else { # Compute OLS estimate by the Normal Equation B <- t(solve(cpFy, crossprod(Fy, X))) } # De-Vectroize (from now on tensor arithmetics) dim(Fy) <- c(n, k, r) dim(X) <- c(n, p, q) # Decompose `B = alpha x beta` into `alpha` and `beta` c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k)) } # Compute residuals R <- X - (Fy %x_3% alpha0 %x_2% beta0) # Covariance estimates and scaling factor Delta.1 <- tcrossprod(mat(R, 3)) / n Delta.2 <- tcrossprod(mat(R, 2)) / n s <- mean(diag(Delta.1)) # Inverse Covariances Delta.1.inv <- solve(Delta.1) Delta.2.inv <- solve(Delta.2) # cross dependent covariance estimates S.1 <- tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) / n S.2 <- tcrossprod(mat(R, 2), mat(R %x_3% Delta.1.inv, 2)) / n # Evaluate negative log-likelihood (2 pi term dropped) loss <- -0.5 * (n * (p * q * log(s) - p * log(det(Delta.1)) - q * log(det(Delta.2))) - s * sum(S.1 * Delta.1.inv)) # Call history callback (logger) before the first iteration if (is.function(logger)) { logger(0L, loss, alpha0, beta0, Delta.1, Delta.2, NA) } ### Step 2: MLE estimate with LS solution as starting value a0 <- 0 a1 <- 1 alpha1 <- alpha0 beta1 <- beta0 # main descent loop no.nesterov <- TRUE break.reason <- NA for (iter in seq_len(max.iter)) { if (no.nesterov) { # without extrapolation as fallback alpha.moment <- alpha1 beta.moment <- beta1 } else { # extrapolation using previous direction alpha.moment <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0) beta.moment <- beta1 + ((a0 - 1) / a1) * ( beta1 - beta0) } # Extrapolated residuals R <- X - (Fy %x_3% alpha.moment %x_2% beta.moment) # Recompute Covariance Estimates and scaling factor Delta.1 <- tcrossprod(mat(R, 3)) / n Delta.2 <- tcrossprod(mat(R, 2)) / n s <- mean(diag(Delta.1)) # Inverse Covariances Delta.1.inv <- solve(Delta.1) Delta.2.inv <- solve(Delta.2) # cross dependent covariance estimates S.1 <- tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) / n S.2 <- tcrossprod(mat(R, 2), mat(R %x_3% Delta.1.inv, 2)) / n # Gradient "generating" tensor G <- (sum(S.1 * Delta.1.inv) - p * q / s) * R G <- G + R %x_2% ((diag(q, p) - s * (Delta.2.inv %*% S.2)) %*% Delta.2.inv) G <- G + R %x_3% ((diag(p, q) - s * (Delta.1.inv %*% S.1)) %*% Delta.1.inv) G <- G + s * (R %x_2% Delta.2.inv %x_3% Delta.1.inv) # Calculate Gradients grad.alpha <- tcrossprod(mat(G, 3), mat(Fy %x_2% beta.moment, 3)) grad.beta <- tcrossprod(mat(G, 2), mat(Fy %x_3% alpha.moment, 2)) # Backtracking line search (Armijo type) # The `inner.prod` is used in the Armijo break condition but does not # depend on the step size. inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2) # backtracking loop for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) { # Update `alpha` and `beta` (note: add(+), the gradients are already # pointing into the negative slope direction of the loss cause they are # the gradients of the log-likelihood [NOT the negative log-likelihood]) alpha.temp <- alpha.moment + delta * grad.alpha beta.temp <- beta.moment + delta * grad.beta # Update Residuals, Covariances, ... R <- X - (Fy %x_3% alpha.temp %x_2% beta.temp) Delta.1 <- tcrossprod(mat(R, 3)) / n Delta.2 <- tcrossprod(mat(R, 2)) / n s <- mean(diag(Delta.1)) Delta.1.inv <- solve(Delta.1) Delta.2.inv <- solve(Delta.2) S.1 <- tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) / n # S.2 not needed # Re-evaluate negative log-likelihood loss.temp <- -0.5 * (n * (p * q * log(s) - p * log(det(Delta.1)) - q * log(det(Delta.2))) - s * sum(S.1 * Delta.1.inv)) # Armijo line search break condition if (loss.temp <= loss - 0.1 * delta * inner.prod) { break } } # Call logger (invoke history callback) if (is.function(logger)) { logger(iter, loss.temp, alpha.temp, beta.temp, Delta.1, Delta.2, delta) } # Enforce descent if (loss.temp < loss) { alpha0 <- alpha1 alpha1 <- alpha.temp beta0 <- beta1 beta1 <- beta.temp # check break conditions if (mean(abs(alpha1)) + mean(abs(beta1)) < eps) { break.reason <- "alpha, beta numerically zero" break # estimates are basically zero -> stop } if (inner.prod < eps * (p * q + r * k)) { break.reason <- "mean squared gradient is smaller than epsilon" break # mean squared gradient is smaller than epsilon -> stop } if (abs(loss.temp - loss) < eps) { break.reason <- "decrease is too small (slow)" break # decrease is too small (slow) -> stop } loss <- loss.temp no.nesterov <- FALSE # always reset } else if (!no.nesterov) { no.nesterov <- TRUE # retry without momentum next } else { break.reason <- "failed even without momentum" break # failed even without momentum -> stop } # update momentum scaling a0 <- a1 a1 <- nesterov.scaling(a1, iter) # Set next iter starting step.size to line searched step size # (while allowing it to encrease) step.size <- 1.618034 * delta } list( loss = loss, alpha = alpha1, beta = beta1, Delta.1 = Delta.1, Delta.2 = Delta.2, tr.Delta = s, break.reason = break.reason ) }