#' Gradient Descent Based Tensor Predictors method
#'
#' @export
kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
    max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
    max.init.iter = 20L, init.method = c("ls", "vlp"),
    eps = .Machine$double.eps,
    logger = NULL
) {
    # Check that X and Fy have the same number of observations
    stopifnot(nrow(X) == NROW(Fy))
    n <- nrow(X)                    # Number of observations

    # Get and check predictor dimensions
    if (length(dim(X)) == 2L) {
        stopifnot(!missing(shape))
        stopifnot(ncol(X) == prod(shape[1:2]))
        p <- as.integer(shape[1])   # Predictor "height"
        q <- as.integer(shape[2])   # Predictor "width"
    } else if (length(dim(X)) == 3L) {
        p <- dim(X)[2]
        q <- dim(X)[3]
        dim(X) <- c(n, p * q)
    } else {
        stop("'X' must be a matrix or 3-tensor")
    }

    # Get and check response dimensions
    if (!is.array(Fy)) {
        Fy <- as.array(Fy)
    }
    if (length(dim(Fy)) == 1L) {
        k <- r <- 1L
        dim(Fy) <- c(n, 1L)
    } else if (length(dim(Fy)) == 2L) {
        stopifnot(!missing(shape))
        stopifnot(ncol(Fy) == prod(shape[3:4]))
        k <- as.integer(shape[3])   # Response functional "height"
        r <- as.integer(shape[4])   # Response functional "width"
    } else if (length(dim(Fy)) == 3L) {
        k <- dim(Fy)[2]
        r <- dim(Fy)[3]
        dim(Fy) <- c(n, k * r)
    } else {
        stop("'Fy' must be a vector, matrix or 3-tensor")
    }

    ### Step 1: (Approx) Least Squares initial estimate
    init.method <- match.arg(init.method)
    if (init.method == "ls") {
        dim(X) <- c(n, p, q)
        dim(Fy) <- c(n, k, r)
        ls <- kpir.ls(X, Fy, max.iter = max.init.iter, sample.axis = 1L, eps = eps)
        c(beta, alpha) %<-% ls$alphas
        dim(X) <- c(n, p * q)
        dim(Fy) <- c(n, k * r)
    } else { # Van Loan and Pitsianis
        # solution for `X = Fy B' + epsilon`
        cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
        if (n <= k * r || qr(cpFy)$rank < k * r) {
            # In case of an under-determined system replace the inverse in the
            # normal equation by the Moore-Penrose Pseudo Inverse
            B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
        } else {
            # Compute the OLS estimate via the Normal Equation
            B <- t(solve(cpFy, crossprod(Fy, X)))
        }
        # Decompose `B = alpha x beta` into `alpha` and `beta`
        c(alpha, beta) %<-% approx.kronecker(B, c(q, r), c(p, k))
    }

    # Compute residuals
    resid <- X - tcrossprod(Fy, kronecker(alpha, beta))

    # Covariance estimate
    Delta <- crossprod(resid) / n

    # Transformed Residuals (using `matpow` as a robust inversion algorithm;
    # it falls back to the Moore-Penrose Pseudo Inverse for singular `Delta`)
    resid.trans <- resid %*% matpow(Delta, -1)

    # Evaluate negative log-likelihood
    loss <- 0.5 * (n * log(det(Delta)) + sum(resid.trans * resid))

    # Call history callback (logger) before the first iteration
    if (is.function(logger)) {
        logger(0L, loss, alpha, beta, Delta, NA)
    }

    ### Step 2: MLE with LS solution as starting value
    for (iter in seq_len(max.iter)) {
        # Sum over Kronecker products by observation (Face-Splitting Product)
        KR <- colSums(rowKronecker(Fy, resid.trans))
        dim(KR) <- c(p, q, k, r)

        # `alpha` Gradient
        R.Alpha <- aperm(KR, c(2, 4, 1, 3))
        dim(R.Alpha) <- c(q * r, p * k)
        grad.alpha <- c(R.Alpha %*% c(beta))

        # `beta` Gradient
        R.Beta <- aperm(KR, c(1, 3, 2, 4))
        dim(R.Beta) <- c(p * k, q * r)
        grad.beta <- c(R.Beta %*% c(alpha))

        # Line Search (Armijo type)
        # The `inner.prod` is used in the Armijo break condition but does not
        # depend on the step size.
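        # Note (added): with the ascent direction `(grad.alpha, grad.beta)` of
        # the log-likelihood, the break condition in the loop below reads
        #   loss(theta + delta * grad) <= loss(theta) - 0.1 * delta * ||grad||^2
        # where `||grad||^2` is exactly this `inner.prod`.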
        inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)

        # Line Search loop
        for (delta in step.size * 0.618034^seq.int(0L, length.out = max.line.iter)) {
            # Update `alpha` and `beta` (note: add(+), the gradients already
            # point into the descent direction of the loss because they are
            # the gradients of the log-likelihood [NOT the negative log-likelihood])
            alpha.temp <- alpha + delta * grad.alpha
            beta.temp <- beta + delta * grad.beta

            # Update Residuals, Covariance and transformed Residuals
            resid <- X - tcrossprod(Fy, kronecker(alpha.temp, beta.temp))
            Delta <- crossprod(resid) / n
            resid.trans <- resid %*% matpow(Delta, -1)

            # Evaluate negative log-likelihood
            loss.temp <- 0.5 * (n * log(det(Delta)) + sum(resid.trans * resid))

            # Armijo line search break condition
            if (loss.temp <= loss - 0.1 * delta * inner.prod) {
                break
            }
        }

        # Call logger (invoke history callback)
        if (is.function(logger)) {
            logger(iter, loss.temp, alpha.temp, beta.temp, Delta, delta)
        }

        # Ensure descent
        if (loss.temp < loss) {
            alpha <- alpha.temp
            beta <- beta.temp

            # Check break conditions (in the descent case)
            if (mean(abs(alpha)) + mean(abs(beta)) < eps) {
                break # basically, estimates are zero -> stop
            }
            if (inner.prod < eps * (p * q + r * k)) {
                break # mean squared gradient is smaller than epsilon -> stop
            }
            if (abs(loss.temp - loss) < eps) {
                break # decrease is too small (slow) -> stop
            }

            loss <- loss.temp
        } else {
            break
        }

        # Set the next iteration's starting step.size to the line-searched
        # step size (while allowing it to increase)
        step.size <- 1.618034 * delta
    }

    list(loss = loss, alpha = alpha, beta = beta, Delta = Delta)
}
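
# A minimal usage sketch (not run). It assumes the surrounding package also
# provides `kpir.ls`, `matpow`, `rowKronecker`, `approx.kronecker` and the
# destructuring operator `%<-%` used above; the dimensions below are arbitrary
# illustration values, not recommendations.
#
#     n <- 100; p <- 5; q <- 4; k <- 3; r <- 2
#     beta  <- matrix(rnorm(p * k), p, k)               # true `beta`  (p x k)
#     alpha <- matrix(rnorm(q * r), q, r)               # true `alpha` (q x r)
#     Fy <- matrix(rnorm(n * k * r), n, k * r)          # vectorized responses
#     X  <- tcrossprod(Fy, kronecker(alpha, beta)) +    # X = Fy (alpha %x% beta)' + eps
#         matrix(rnorm(n * p * q, sd = 0.1), n, p * q)
#     fit <- kpir.new(X, Fy, shape = c(p, q, k, r), init.method = "vlp")
#     str(fit)  # list with components `loss`, `alpha`, `beta`, `Delta`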