tensor_predictors/tensorPredictors/R/kpir_approx.R

244 lines
8.6 KiB
R

#' Using unbiased (but not MLE) estimates for the Kronecker decomposed
#' covariance matrices Delta_1, Delta_2 for approximating the log-likelihood
#' giving a closed form solution for the gradient.
#'
#' Delta_1 = n^-1 sum_i R_i' R_i,
#' Delta_2 = n^-1 sum_i R_i R_i'.
#'
#' @export
kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
max.init.iter = 20L, init.method = c("ls", "vlp"),
eps = .Machine$double.eps,
logger = NULL
) {
# Check if X and Fy have same number of observations
stopifnot(nrow(X) == NROW(Fy))
n <- nrow(X) # Number of observations
# Check predictor dimensions
if (length(dim(X)) == 2L) {
stopifnot(!missing(shape))
stopifnot(ncol(X) == prod(shape[1:2]))
p <- as.integer(shape[1]) # Predictor "height"
q <- as.integer(shape[2]) # Predictor "width"
} else if (length(dim(X)) == 3L) {
p <- dim(X)[2]
q <- dim(X)[3]
} else {
stop("'X' must be a matrix or 3-tensor")
}
# Check response dimensions
if (!is.array(Fy)) {
Fy <- as.array(Fy)
}
if (length(dim(Fy)) == 1L) {
k <- r <- 1L
} else if (length(dim(Fy)) == 2L) {
stopifnot(!missing(shape))
stopifnot(ncol(Fy) == prod(shape[3:4]))
k <- as.integer(shape[3]) # Response functional "height"
r <- as.integer(shape[4]) # Response functional "width"
} else if (length(dim(Fy)) == 3L) {
k <- dim(Fy)[2]
r <- dim(Fy)[3]
} else {
stop("'Fy' must be a vector, matrix or 3-tensor")
}
### Step 1: (Approx) Least Squares initial estimate
init.method <- match.arg(init.method)
if (init.method == "ls") {
# De-Vectroize (from now on tensor arithmetics)
dim(Fy) <- c(n, k, r)
dim(X) <- c(n, p, q)
ls <- kpir.ls(X, Fy, max.iter = max.init.iter, sample.axis = 1L, eps = eps)
c(beta0, alpha0) %<-% ls$alphas
} else { # Van Loan and Pitsianis
# Vectorize
dim(Fy) <- c(n, k * r)
dim(X) <- c(n, p * q)
# solution for `X = Fy B' + epsilon`
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
if (n <= k * r || qr(cpFy)$rank < k * r) {
# In case of under-determined system replace the inverse in the normal
# equation by the Moore-Penrose Pseudo Inverse
B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
} else {
# Compute OLS estimate by the Normal Equation
B <- t(solve(cpFy, crossprod(Fy, X)))
}
# De-Vectroize (from now on tensor arithmetics)
dim(Fy) <- c(n, k, r)
dim(X) <- c(n, p, q)
# Decompose `B = alpha x beta` into `alpha` and `beta`
c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
}
# Compute residuals
R <- X - (Fy %x_3% alpha0 %x_2% beta0)
# Covariance estimates and scaling factor
Delta.1 <- tcrossprod(mat(R, 3)) / n
Delta.2 <- tcrossprod(mat(R, 2)) / n
s <- mean(diag(Delta.1))
# Inverse Covariances
Delta.1.inv <- solve(Delta.1)
Delta.2.inv <- solve(Delta.2)
# cross dependent covariance estimates
S.1 <- tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) / n
S.2 <- tcrossprod(mat(R, 2), mat(R %x_3% Delta.1.inv, 2)) / n
# Evaluate negative log-likelihood (2 pi term dropped)
loss <- -0.5 * (n * (p * q * log(s) - p * log(det(Delta.1)) -
q * log(det(Delta.2))) - s * sum(S.1 * Delta.1.inv))
# Call history callback (logger) before the first iteration
if (is.function(logger)) {
logger(0L, loss, alpha0, beta0, Delta.1, Delta.2, NA)
}
### Step 2: MLE estimate with LS solution as starting value
a0 <- 0
a1 <- 1
alpha1 <- alpha0
beta1 <- beta0
# main descent loop
no.nesterov <- TRUE
break.reason <- NA
for (iter in seq_len(max.iter)) {
if (no.nesterov) {
# without extrapolation as fallback
alpha.moment <- alpha1
beta.moment <- beta1
} else {
# extrapolation using previous direction
alpha.moment <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0)
beta.moment <- beta1 + ((a0 - 1) / a1) * ( beta1 - beta0)
}
# Extrapolated residuals
R <- X - (Fy %x_3% alpha.moment %x_2% beta.moment)
# Recompute Covariance Estimates and scaling factor
Delta.1 <- tcrossprod(mat(R, 3)) / n
Delta.2 <- tcrossprod(mat(R, 2)) / n
s <- mean(diag(Delta.1))
# Inverse Covariances
Delta.1.inv <- solve(Delta.1)
Delta.2.inv <- solve(Delta.2)
# cross dependent covariance estimates
S.1 <- tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) / n
S.2 <- tcrossprod(mat(R, 2), mat(R %x_3% Delta.1.inv, 2)) / n
# Gradient "generating" tensor
G <- (sum(S.1 * Delta.1.inv) - p * q / s) * R
G <- G + R %x_2% ((diag(q, p) - s * (Delta.2.inv %*% S.2)) %*% Delta.2.inv)
G <- G + R %x_3% ((diag(p, q) - s * (Delta.1.inv %*% S.1)) %*% Delta.1.inv)
G <- G + s * (R %x_2% Delta.2.inv %x_3% Delta.1.inv)
# Calculate Gradients
grad.alpha <- tcrossprod(mat(G, 3), mat(Fy %x_2% beta.moment, 3))
grad.beta <- tcrossprod(mat(G, 2), mat(Fy %x_3% alpha.moment, 2))
# Backtracking line search (Armijo type)
# The `inner.prod` is used in the Armijo break condition but does not
# depend on the step size.
inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
# backtracking loop
for (delta in step.size * 0.618034^seq.int(0L, length = max.line.iter)) {
# Update `alpha` and `beta` (note: add(+), the gradients are already
# pointing into the negative slope direction of the loss cause they are
# the gradients of the log-likelihood [NOT the negative log-likelihood])
alpha.temp <- alpha.moment + delta * grad.alpha
beta.temp <- beta.moment + delta * grad.beta
# Update Residuals, Covariances, ...
R <- X - (Fy %x_3% alpha.temp %x_2% beta.temp)
Delta.1 <- tcrossprod(mat(R, 3)) / n
Delta.2 <- tcrossprod(mat(R, 2)) / n
s <- mean(diag(Delta.1))
Delta.1.inv <- solve(Delta.1)
Delta.2.inv <- solve(Delta.2)
S.1 <- tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) / n
# S.2 not needed
# Re-evaluate negative log-likelihood
loss.temp <- -0.5 * (n * (p * q * log(s) - p * log(det(Delta.1)) -
q * log(det(Delta.2))) - s * sum(S.1 * Delta.1.inv))
# Armijo line search break condition
if (loss.temp <= loss - 0.1 * delta * inner.prod) {
break
}
}
# Call logger (invoke history callback)
if (is.function(logger)) {
logger(iter, loss.temp, alpha.temp, beta.temp, Delta.1, Delta.2, delta)
}
# Enforce descent
if (loss.temp < loss) {
alpha0 <- alpha1
alpha1 <- alpha.temp
beta0 <- beta1
beta1 <- beta.temp
# check break conditions
if (mean(abs(alpha1)) + mean(abs(beta1)) < eps) {
break.reason <- "alpha, beta numerically zero"
break # estimates are basically zero -> stop
}
if (inner.prod < eps * (p * q + r * k)) {
break.reason <- "mean squared gradient is smaller than epsilon"
break # mean squared gradient is smaller than epsilon -> stop
}
if (abs(loss.temp - loss) < eps) {
break.reason <- "decrease is too small (slow)"
break # decrease is too small (slow) -> stop
}
loss <- loss.temp
no.nesterov <- FALSE # always reset
} else if (!no.nesterov) {
no.nesterov <- TRUE # retry without momentum
next
} else {
break.reason <- "failed even without momentum"
break # failed even without momentum -> stop
}
# update momentum scaling
a0 <- a1
a1 <- nesterov.scaling(a1, iter)
# Set next iter starting step.size to line searched step size
# (while allowing it to encrease)
step.size <- 1.618034 * delta
}
list(
loss = loss,
alpha = alpha1, beta = beta1,
Delta.1 = Delta.1, Delta.2 = Delta.2, tr.Delta = s,
break.reason = break.reason
)
}