tensor_predictors/tensorPredictors/R/kpir_approx.R

#' Using unbiased (but not MLE) estimates for the Kronecker decomposed
#' covariance matrices Delta_1, Delta_2 for approximating the log-likelihood
#' giving a closed form solution for the gradient.
#'
#'      Delta_1 = n^-1 sum_i R_i' R_i,
#'      Delta_2 = n^-1 sum_i R_i R_i'.
#'
#' @export
kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
    max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
    nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
    eps = .Machine$double.eps,
    logger = NULL
) {

    # Check if X and Fy have same number of observations
    stopifnot(nrow(X) == NROW(Fy))
    n <- nrow(X)                        # Number of observations

    # Get and check predictor dimensions
    if (length(dim(X)) == 2L) {
        stopifnot(!missing(shape))
        stopifnot(ncol(X) == prod(shape[1:2]))
        p <- as.integer(shape[1])       # Predictor "height"
        q <- as.integer(shape[2])       # Predictor "width"
    } else if (length(dim(X)) == 3L) {
        p <- dim(X)[2]
        q <- dim(X)[3]
        dim(X) <- c(n, p * q)
    } else {
        stop("'X' must be a matrix or 3-tensor")
    }

    # Get and check response dimensions
    if (!is.array(Fy)) {
        Fy <- as.array(Fy)
    }
    if (length(dim(Fy)) == 1L) {
        k <- r <- 1L
        dim(Fy) <- c(n, 1L)
    } else if (length(dim(Fy)) == 2L) {
        stopifnot(!missing(shape))
        stopifnot(ncol(Fy) == prod(shape[3:4]))
        k <- as.integer(shape[3])       # Response functional "height"
        r <- as.integer(shape[4])       # Response functional "width"
    } else if (length(dim(Fy)) == 3L) {
        k <- dim(Fy)[2]
        r <- dim(Fy)[3]
        dim(Fy) <- c(n, k * r)
    } else {
        stop("'Fy' must be a vector, matrix or 3-tensor")
    }


    ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
    # Vectorize
    dim(Fy) <- c(n, k * r)
    dim(X)  <- c(n, p * q)
    # Solve
    cpFy <- crossprod(Fy)               # TODO: Check/Test and/or replace
    if (n <= k * r || qr(cpFy)$rank < k * r) {
        # In case of under-determined system replace the inverse in the normal
        # equation by the Moore-Penrose Pseudo Inverse
        B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
    } else {
        # Compute OLS estimate by the Normal Equation
        B <- t(solve(cpFy, crossprod(Fy, X)))
    }

    # De-Vectroize (from now on tensor arithmetics)
    dim(Fy) <- c(n, k, r)
    dim(X)  <- c(n, p, q)

    # Decompose `B = alpha x beta` into `alpha` and `beta`
    c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))

    # Compute residuals
    R <- X - (Fy %x_3% alpha0 %x_2% beta0)

    # Covariance estimates and scaling factor
    Delta.1 <- tcrossprod(mat(R, 3)) / n
    Delta.2 <- tcrossprod(mat(R, 2)) / n
    s <- mean(diag(Delta.1))

    # Inverse Covariances
    Delta.1.inv <- solve(Delta.1)
    Delta.2.inv <- solve(Delta.2)

    # cross dependent covariance estimates
    S.1 <- tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) / n
    S.2 <- tcrossprod(mat(R, 2), mat(R %x_3% Delta.1.inv, 2)) / n

    # Evaluate negative log-likelihood (2 pi term dropped)
    loss <- -0.5 * (n * (p * q * log(s) - p * log(det(Delta.1)) -
        q * log(det(Delta.2))) - s * sum(S.1 * Delta.1.inv))

    # Call history callback (logger) before the first iteration
    if (is.function(logger)) {
        logger(0L, loss, alpha0, beta0, Delta.1, Delta.2, NA)
    }


    ### Step 2: MLE estimate with LS solution as starting value
    a0 <- 0
    a1 <- 1
    alpha1 <- alpha0
    beta1 <- beta0

    # main descent loop
    no.nesterov <- TRUE
    break.reason <- NA
    for (iter in seq_len(max.iter)) {
        if (no.nesterov) {
            # without extrapolation as fallback
            alpha.moment <- alpha1
            beta.moment  <- beta1
        } else {
            # extrapolation using previous direction
            alpha.moment <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0)
            beta.moment  <-  beta1 + ((a0 - 1) / a1) * ( beta1 -  beta0)
        }

        # Extrapolated residuals
        R <- X - (Fy %x_3% alpha.moment %x_2% beta.moment)

        # Recompute Covariance Estimates and scaling factor
        Delta.1 <- tcrossprod(mat(R, 3)) / n
        Delta.2 <- tcrossprod(mat(R, 2)) / n
        s <- mean(diag(Delta.1))

        # Inverse Covariances
        Delta.1.inv <- solve(Delta.1)
        Delta.2.inv <- solve(Delta.2)

        # cross dependent covariance estimates
        S.1 <- tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) / n
        S.2 <- tcrossprod(mat(R, 2), mat(R %x_3% Delta.1.inv, 2)) / n

        # Gradient "generating" tensor
        G <- (sum(S.1 * Delta.1.inv) - p * q / s) * R
        G <- G + R %x_2% ((diag(q, p) - s * (Delta.2.inv %*% S.2)) %*% Delta.2.inv)
        G <- G + R %x_3% ((diag(p, q) - s * (Delta.1.inv %*% S.1)) %*% Delta.1.inv)
        G <- G + s * (R %x_2% Delta.2.inv %x_3% Delta.1.inv)

        # Calculate Gradients
        grad.alpha <- tcrossprod(mat(G, 3), mat(Fy %x_2% beta.moment, 3))
        grad.beta  <- tcrossprod(mat(G, 2), mat(Fy %x_3% alpha.moment, 2))

        # Backtracking line search (Armijo type)
        # The `inner.prod` is used in the Armijo break condition but does not
        # depend on the step size.
        inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)

        # backtracking loop
        for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) {
            # Update `alpha` and `beta` (note: add(+), the gradients are already
            # pointing into the negative slope direction of the loss cause they are
            # the gradients of the log-likelihood [NOT the negative log-likelihood])
            alpha.temp <- alpha.moment + delta * grad.alpha
            beta.temp  <- beta.moment  + delta * grad.beta

            # Update Residuals, Covariances, ...
            R <- X - (Fy %x_3% alpha.temp %x_2% beta.temp)
            Delta.1 <- tcrossprod(mat(R, 3)) / n
            Delta.2 <- tcrossprod(mat(R, 2)) / n
            s <- mean(diag(Delta.1))
            Delta.1.inv <- solve(Delta.1)
            Delta.2.inv <- solve(Delta.2)
            S.1 <- tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) / n
            # S.2 not needed

            # Re-evaluate negative log-likelihood
            loss.temp <- -0.5 * (n * (p * q * log(s) - p * log(det(Delta.1)) -
                q * log(det(Delta.2))) - s * sum(S.1 * Delta.1.inv))

            # Armijo line search break condition
            if (loss.temp <= loss - 0.1 * delta * inner.prod) {
                break
            }
        }

        # Call logger (invoke history callback)
        if (is.function(logger)) {
            logger(iter, loss.temp, alpha.temp, beta.temp, Delta.1, Delta.2, delta)
        }

        # Enforce descent
        if (loss.temp < loss) {
            alpha0 <- alpha1
            alpha1 <- alpha.temp
            beta0 <- beta1
            beta1 <- beta.temp

            # check break conditions
            if (mean(abs(alpha1)) + mean(abs(beta1)) < eps) {
                break.reason <- "alpha, beta numerically zero"
                break   # estimates are basically zero -> stop
            }
            if (inner.prod < eps * (p * q + r * k)) {
                break.reason <- "mean squared gradient is smaller than epsilon"
                break   # mean squared gradient is smaller than epsilon -> stop
            }
            if (abs(loss.temp - loss) < eps) {
                break.reason <- "decrease is too small (slow)"
                break   # decrease is too small (slow) -> stop
            }

            loss  <- loss.temp
            no.nesterov <- FALSE    # always reset
        } else if (!no.nesterov) {
            no.nesterov <- TRUE     # retry without momentum
            next
        } else {
            break.reason <- "failed even without momentum"
            break       # failed even without momentum -> stop
        }

        # update momentum scaling
        a0 <- a1
        a1 <- nesterov.scaling(a1, iter)

        # Set next iter starting step.size to line searched step size
        # (while allowing it to encrease)
        step.size <- 1.618034 * delta

    }

    list(
        loss = loss,
        alpha = alpha1, beta = beta1,
        Delta.1 = Delta.1, Delta.2 = Delta.2, tr.Delta = s,
        break.reason = break.reason
    )
}
wip: writing next method kpir_approx 2022-04-29 16:37:25 +00:00			`#' Using unbiased (but not MLE) estimates for the Kronecker decomposed`
			`#' covariance matrices Delta_1, Delta_2 for approximating the log-likelihood`
			`#' giving a closed form solution for the gradient.`
			`#'`
			`#' Delta_1 = n^-1 sum_i R_i' R_i,`
			`#' Delta_2 = n^-1 sum_i R_i R_i'.`
implemented kpir_approx, wip: kpir_sim 2022-05-05 11:35:07 +00:00			`#'`
wip: writing next method kpir_approx 2022-04-29 16:37:25 +00:00			`#' @export`
			`kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),`
			`max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,`
			`nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),`
			`eps = .Machine$double.eps,`
			`logger = NULL`
			`) {`

			`# Check if X and Fy have same number of observations`
			`stopifnot(nrow(X) == NROW(Fy))`
			`n <- nrow(X) # Number of observations`

			`# Get and check predictor dimensions`
			`if (length(dim(X)) == 2L) {`
			`stopifnot(!missing(shape))`
			`stopifnot(ncol(X) == prod(shape[1:2]))`
			`p <- as.integer(shape[1]) # Predictor "height"`
			`q <- as.integer(shape[2]) # Predictor "width"`
			`} else if (length(dim(X)) == 3L) {`
			`p <- dim(X)[2]`
			`q <- dim(X)[3]`
			`dim(X) <- c(n, p * q)`
			`} else {`
			`stop("'X' must be a matrix or 3-tensor")`
			`}`

			`# Get and check response dimensions`
			`if (!is.array(Fy)) {`
			`Fy <- as.array(Fy)`
			`}`
			`if (length(dim(Fy)) == 1L) {`
			`k <- r <- 1L`
			`dim(Fy) <- c(n, 1L)`
			`} else if (length(dim(Fy)) == 2L) {`
			`stopifnot(!missing(shape))`
			`stopifnot(ncol(Fy) == prod(shape[3:4]))`
			`k <- as.integer(shape[3]) # Response functional "height"`
			`r <- as.integer(shape[4]) # Response functional "width"`
			`} else if (length(dim(Fy)) == 3L) {`
			`k <- dim(Fy)[2]`
			`r <- dim(Fy)[3]`
			`dim(Fy) <- c(n, k * r)`
			`} else {`
			`stop("'Fy' must be a vector, matrix or 3-tensor")`
			`}`


			### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
			`# Vectorize`
			`dim(Fy) <- c(n, k * r)`
			`dim(X) <- c(n, p * q)`
			`# Solve`
			`cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace`
			`if (n <= k * r \|\| qr(cpFy)$rank < k * r) {`
			`# In case of under-determined system replace the inverse in the normal`
			`# equation by the Moore-Penrose Pseudo Inverse`
			`B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))`
			`} else {`
			`# Compute OLS estimate by the Normal Equation`
			`B <- t(solve(cpFy, crossprod(Fy, X)))`
			`}`

			`# De-Vectroize (from now on tensor arithmetics)`
			`dim(Fy) <- c(n, k, r)`
			`dim(X) <- c(n, p, q)`

			# Decompose `B = alpha x beta` into `alpha` and `beta`
			`c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))`

			`# Compute residuals`
			`R <- X - (Fy %x_3% alpha0 %x_2% beta0)`

			`# Covariance estimates and scaling factor`
add: rtensornorm (sample tensor normal), add: mcrossprod (mode crossprod), wip: simulations, add: notes on multi-array (tensor) normal sampling, wip: initial estimates for alpha, beta, fix: scaling in kpir_approx 2022-05-06 20:27:24 +00:00			`Delta.1 <- tcrossprod(mat(R, 3)) / n`
			`Delta.2 <- tcrossprod(mat(R, 2)) / n`
			`s <- mean(diag(Delta.1))`
wip: writing next method kpir_approx 2022-04-29 16:37:25 +00:00
			`# Inverse Covariances`
			`Delta.1.inv <- solve(Delta.1)`
			`Delta.2.inv <- solve(Delta.2)`

			`# cross dependent covariance estimates`
add: rtensornorm (sample tensor normal), add: mcrossprod (mode crossprod), wip: simulations, add: notes on multi-array (tensor) normal sampling, wip: initial estimates for alpha, beta, fix: scaling in kpir_approx 2022-05-06 20:27:24 +00:00			`S.1 <- tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) / n`
			`S.2 <- tcrossprod(mat(R, 2), mat(R %x_3% Delta.1.inv, 2)) / n`
wip: writing next method kpir_approx 2022-04-29 16:37:25 +00:00
			`# Evaluate negative log-likelihood (2 pi term dropped)`
implemented kpir_approx, wip: kpir_sim 2022-05-05 11:35:07 +00:00			`loss <- -0.5 * (n * (p * q * log(s) - p * log(det(Delta.1)) -`
			`q * log(det(Delta.2))) - s * sum(S.1 * Delta.1.inv))`
wip: writing next method kpir_approx 2022-04-29 16:37:25 +00:00
			`# Call history callback (logger) before the first iteration`
			`if (is.function(logger)) {`
			`logger(0L, loss, alpha0, beta0, Delta.1, Delta.2, NA)`
			`}`


			`### Step 2: MLE estimate with LS solution as starting value`
			`a0 <- 0`
			`a1 <- 1`
			`alpha1 <- alpha0`
			`beta1 <- beta0`

			`# main descent loop`
			`no.nesterov <- TRUE`
			`break.reason <- NA`
			`for (iter in seq_len(max.iter)) {`
			`if (no.nesterov) {`
			`# without extrapolation as fallback`
			`alpha.moment <- alpha1`
			`beta.moment <- beta1`
			`} else {`
			`# extrapolation using previous direction`
			`alpha.moment <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0)`
			`beta.moment <- beta1 + ((a0 - 1) / a1) * ( beta1 - beta0)`
			`}`

implemented kpir_approx, wip: kpir_sim 2022-05-05 11:35:07 +00:00			`# Extrapolated residuals`
			`R <- X - (Fy %x_3% alpha.moment %x_2% beta.moment)`

			`# Recompute Covariance Estimates and scaling factor`
add: rtensornorm (sample tensor normal), add: mcrossprod (mode crossprod), wip: simulations, add: notes on multi-array (tensor) normal sampling, wip: initial estimates for alpha, beta, fix: scaling in kpir_approx 2022-05-06 20:27:24 +00:00			`Delta.1 <- tcrossprod(mat(R, 3)) / n`
			`Delta.2 <- tcrossprod(mat(R, 2)) / n`
			`s <- mean(diag(Delta.1))`
implemented kpir_approx, wip: kpir_sim 2022-05-05 11:35:07 +00:00
			`# Inverse Covariances`
			`Delta.1.inv <- solve(Delta.1)`
			`Delta.2.inv <- solve(Delta.2)`

			`# cross dependent covariance estimates`
add: rtensornorm (sample tensor normal), add: mcrossprod (mode crossprod), wip: simulations, add: notes on multi-array (tensor) normal sampling, wip: initial estimates for alpha, beta, fix: scaling in kpir_approx 2022-05-06 20:27:24 +00:00			`S.1 <- tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) / n`
			`S.2 <- tcrossprod(mat(R, 2), mat(R %x_3% Delta.1.inv, 2)) / n`
implemented kpir_approx, wip: kpir_sim 2022-05-05 11:35:07 +00:00
			`# Gradient "generating" tensor`
			`G <- (sum(S.1 * Delta.1.inv) - p * q / s) * R`
			`G <- G + R %x_2% ((diag(q, p) - s * (Delta.2.inv %% S.2)) %% Delta.2.inv)`
			`G <- G + R %x_3% ((diag(p, q) - s * (Delta.1.inv %% S.1)) %% Delta.1.inv)`
			`G <- G + s * (R %x_2% Delta.2.inv %x_3% Delta.1.inv)`

			`# Calculate Gradients`
			`grad.alpha <- tcrossprod(mat(G, 3), mat(Fy %x_2% beta.moment, 3))`
			`grad.beta <- tcrossprod(mat(G, 2), mat(Fy %x_3% alpha.moment, 2))`

			`# Backtracking line search (Armijo type)`
			# The `inner.prod` is used in the Armijo break condition but does not
			`# depend on the step size.`
			`inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)`

			`# backtracking loop`
			`for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) {`
			# Update `alpha` and `beta` (note: add(+), the gradients are already
			`# pointing into the negative slope direction of the loss cause they are`
			`# the gradients of the log-likelihood [NOT the negative log-likelihood])`
			`alpha.temp <- alpha.moment + delta * grad.alpha`
			`beta.temp <- beta.moment + delta * grad.beta`

			`# Update Residuals, Covariances, ...`
			`R <- X - (Fy %x_3% alpha.temp %x_2% beta.temp)`
add: rtensornorm (sample tensor normal), add: mcrossprod (mode crossprod), wip: simulations, add: notes on multi-array (tensor) normal sampling, wip: initial estimates for alpha, beta, fix: scaling in kpir_approx 2022-05-06 20:27:24 +00:00			`Delta.1 <- tcrossprod(mat(R, 3)) / n`
			`Delta.2 <- tcrossprod(mat(R, 2)) / n`
			`s <- mean(diag(Delta.1))`
implemented kpir_approx, wip: kpir_sim 2022-05-05 11:35:07 +00:00			`Delta.1.inv <- solve(Delta.1)`
			`Delta.2.inv <- solve(Delta.2)`
add: rtensornorm (sample tensor normal), add: mcrossprod (mode crossprod), wip: simulations, add: notes on multi-array (tensor) normal sampling, wip: initial estimates for alpha, beta, fix: scaling in kpir_approx 2022-05-06 20:27:24 +00:00			`S.1 <- tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) / n`
implemented kpir_approx, wip: kpir_sim 2022-05-05 11:35:07 +00:00			`# S.2 not needed`

			`# Re-evaluate negative log-likelihood`
			`loss.temp <- -0.5 * (n * (p * q * log(s) - p * log(det(Delta.1)) -`
			`q * log(det(Delta.2))) - s * sum(S.1 * Delta.1.inv))`

			`# Armijo line search break condition`
			`if (loss.temp <= loss - 0.1 * delta * inner.prod) {`
			`break`
			`}`
			`}`

			`# Call logger (invoke history callback)`
			`if (is.function(logger)) {`
			`logger(iter, loss.temp, alpha.temp, beta.temp, Delta.1, Delta.2, delta)`
			`}`

			`# Enforce descent`
			`if (loss.temp < loss) {`
			`alpha0 <- alpha1`
			`alpha1 <- alpha.temp`
			`beta0 <- beta1`
			`beta1 <- beta.temp`

			`# check break conditions`
			`if (mean(abs(alpha1)) + mean(abs(beta1)) < eps) {`
			`break.reason <- "alpha, beta numerically zero"`
			`break # estimates are basically zero -> stop`
			`}`
			`if (inner.prod < eps * (p * q + r * k)) {`
			`break.reason <- "mean squared gradient is smaller than epsilon"`
			`break # mean squared gradient is smaller than epsilon -> stop`
			`}`
			`if (abs(loss.temp - loss) < eps) {`
			`break.reason <- "decrease is too small (slow)"`
			`break # decrease is too small (slow) -> stop`
			`}`

			`loss <- loss.temp`
			`no.nesterov <- FALSE # always reset`
			`} else if (!no.nesterov) {`
			`no.nesterov <- TRUE # retry without momentum`
			`next`
			`} else {`
			`break.reason <- "failed even without momentum"`
			`break # failed even without momentum -> stop`
			`}`

			`# update momentum scaling`
			`a0 <- a1`
			`a1 <- nesterov.scaling(a1, iter)`

			`# Set next iter starting step.size to line searched step size`
			`# (while allowing it to encrease)`
			`step.size <- 1.618034 * delta`
wip: writing next method kpir_approx 2022-04-29 16:37:25 +00:00
implemented kpir_approx, wip: kpir_sim 2022-05-05 11:35:07 +00:00			`}`
wip: writing next method kpir_approx 2022-04-29 16:37:25 +00:00
implemented kpir_approx, wip: kpir_sim 2022-05-05 11:35:07 +00:00			`list(`
			`loss = loss,`
			`alpha = alpha1, beta = beta1,`
			`Delta.1 = Delta.1, Delta.2 = Delta.2, tr.Delta = s,`
			`break.reason = break.reason`
			`)`
wip: writing next method kpir_approx 2022-04-29 16:37:25 +00:00			`}`