#' HOPIR subroutine for the LS solution given preprocessed `X`, `Fy` and initial
#' values `alphas`.
#'
#' @keywords internal
HOPIR.ls <- function(X, Fy, alphas, sample.axis, algorithm, ..., logger) {
    # Get axis indices (observation modes)
    modes <- seq_along(dim(X))[-sample.axis]
    n <- dim(X)[sample.axis]                    # observation count (scalar)
    p <- dim(X)[-sample.axis]                   # predictor dimensions (vector)

    # Least Squares Deltas Estimates given alphas
    fun.Deltas <- function(alphas) {
        # Residuals
        R <- X - mlm(Fy, alphas, modes = modes)
        # `Delta` moment estimates
        Deltas <- Map(mcrossprod, list(R), mode = modes)
        Map(`*`, p / (n * prod(p)), Deltas)
    }

    # wrap logger, provide unified logger interface for all HOPIR subroutines
    if (is.function(logger)) {
        callback <- function(iter, alphas) {
            logger("ls", iter, alphas, fun.Deltas(alphas))
        }
    } else {
        callback <- NULL
    }

    if (algorithm == "icu") {
        # Call (proper parameterized) Iterative Cyclic Updating Optimizer
        alphas <- ICU(
            # Optimization Objective (MSE)
            fun.loss = function(alphas) mean((X - mlm(Fy, alphas, modes))^2),
            # Updating rule (optimal solution for `alpha_j` given the rest)
            fun.update = function(alphas, j) {
                Z <- mlm(Fy, alphas[-j], modes = modes[-j])
                # least squares solution for `alpha_j | alpha_i, i != j`
                alphas[[j]] <- t(solve(
                    mcrossprod(Z, Z, modes[j]), mcrossprod(Z, X, modes[j])
                ))
            },
            # Initial parameter estimates
            params = alphas,
            ...,
            callback = callback)
    } else {
        # Call (proper parameterized) Nesterov Accelerated Gradient Descent
        alphas <- NAGD(
            # Setup objective function (MSE)
            fun.loss = function(alphas) mean((X - mlm(Fy, alphas, modes))^2),
            # Gradient of the objective with respect to the parameter matirces `alpha_j`
            fun.grad = function(alphas) {
                # Residuals
                R <- X - mlm(Fy, alphas, modes)
                # Gradients for each alpha
                Map(function(j) {
                    # MLM of Fy with alpha_k, k in [r] \ j
                    Fa <- mlm(Fy, alphas[-j], modes[-j])
                    # Gradient of the loss with respect to alpha_j
                    (-2 / prod(dim(X))) * mcrossprod(R, Fa, modes[j])
                }, seq_along(modes))
            },
            params = alphas,
            # Linear Combination of parameters, basically: a * lhs + b * rhs for each
            # combination of elements in the LHS and RHS lists with scalars a, b.
            fun.lincomb = function(a, LHS, b, RHS) {
                Map(function(lhs, rhs) a * lhs + b * rhs, LHS, RHS)
            },
            # squared norm of parameters
            fun.norm2 = function(params) sum(unlist(params, use.names = FALSE)^2),
            ...,
            callback = callback)
    }

    # Final estimate includes Deltas
    list(alphas = alphas, Deltas = fun.Deltas(alphas))
}

#' HPOIR subroutine for the MLE estimation given preprocessed data and initial
#' alpha, Delta parameters
#'
#' @keywords internal
HOPIR.mle <- function(X, Fy, alphas, Deltas, sample.axis, algorithm, ..., logger) {
    # Get axis indices (observation modes)
    modes <- seq_along(dim(X))[-sample.axis]
    n <- dim(X)[sample.axis]                    # observation count (scalar)
    p <- dim(X)[-sample.axis]                   # predictor dimensions (vector)

    if (algorithm == "icu") {
        # Call (proper parameterized) Iterative Cyclic Updating Optimizer
        params <- ICU(
            # Optimization Objective (negative log-likelihood)
            fun.loss = function(params) {
                # residuals
                R <- X - mlm(Fy, params$alphas, modes)
                # negative log-likelihood (without additive constant term)
                0.5 * (
                    n * prod(p) * sum(log(unlist(Map(det, params$Deltas))) / p) +
                    sum(mlm(R, Map(solve, params$Deltas), modes) * R)
                )
            },
            # Updating rule, optimal solution for `alpha_j` or `Delta_j` given
            # all other parameters
            fun.update = function(params, index) {
                # residuals
                R <- X - mlm(Fy, params$alphas, modes)
                # mode (axis) index
                j <- index[2]
                if (index[1] == 1) {
                    # compute subterms
                    Delta.invs <- Map(solve, params$Deltas)
                    Delta.inv.alphas <- Map(`%*%`, Delta.invs, alphas)
                    XxDi  <- mlm(X,  Delta.invs[-j],       modes[-j])
                    Fxa   <- mlm(Fy,           alphas[-j], modes[-j])
                    FxDia <- mlm(Fy, Delta.inv.alphas[-j], modes[-j])
                    # alpha update
                    mcrossprod(XxDi, Fxa, modes[j]) %*%
                        solve(mcrossprod(FxDia, Fxa, modes[j]))
                } else { # index[1] == 2
                    # Delta update
                    (p[j] / (n * prod(p))) * mcrossprod(
                        mlm(R, Map(solve, params$Deltas[-j]), modes[-j]), R, modes[j])
                }
            },
            # collection of initial alpha and Delta parameters
            params = list(alphas = alphas, Deltas = Deltas),
            # parameter "path"-indices, first index {1, 2} toggles between alphas
            # and Deltas, second index is the mode.
            # Example: `list(list("a1", "a2", "a3"), list("D1", "D2", "D3"))[[c(2, 1)]] == "D1"`
            indices = c(Map(c, 1L, seq_along(modes)), Map(c, 2L, seq_along(modes))),
            ...,
            callback = if (is.function(logger)) {
                    function(iter, params) {
                        logger("mle", iter, params$alphas, params$Deltas)
                    }
                } else {
                    NULL
                })
    } else {
        # Call (proper parameterized) Nesterov Accelerated Gradient Descent
        # Note that only the `alphas` are subject of Gradient Descent and the
        # `Deltas` are additional parameters updated given the cueent `alphas`.
        # Meaning that the gradient is the gradient of the loss with respect to
        # the `alphas` only.
        params <- NAGD(
            # Setup objective function (negative log-likelihood)
            fun.loss = function(alphas, Deltas) {
                # residuals
                R <- X - mlm(Fy, alphas, modes)
                # negative log-likelihood (without additive constant term)
                0.5 * (
                    n * prod(p) * sum(log(unlist(Map(det, Deltas))) / p) +
                    sum(mlm(R, Map(solve, Deltas), modes) * R)
                )
            },
            # Gradient of the objective with respect to the parameter matirces
            # `alphas` only, only the first argument, the second are `more.params`.
            fun.grad = function(alphas, Deltas) {
                # Residuals
                R <- X - mlm(Fy, alphas, modes)
                # Gradients for each alpha
                Map(function(j) {
                    # MLM of Fy with alpha_k, k in [r] \ j
                    Fa <- mlm(Fy, alphas[-j], modes[-j])
                    # Gradient of the loss with respect to alpha_j
                    (-2 / prod(dim(X))) * mcrossprod(R, Fa, modes[j])
                }, seq_along(modes))
            },
            # Initial parameters (subject to Gradient Descent)
            params = alphas,
            # Initial additional parameters (subject to updating given `params`)
            more.params = Deltas,
            # Update `Deltas` given `alphas`
            fun.more.params = function(alphas, old.Deltas) {
                # Residuals
                R <- X - mlm(Fy, alphas, modes)
                # Solve cross dependent System of Delta equations using the ICU
                # algorithm (with a small number of iterations, should be enough)
                ICU(
                    fun.loss = function(Deltas) {
                        # negative log-likelihood (without additive constant term)
                        0.5 * (
                            n * prod(p) * sum(log(unlist(Map(det, Deltas))) / p) +
                            sum(mlm(R, Map(solve, Deltas), modes) * R)
                        )
                    },
                    fun.update = function(Deltas, j) {
                        # Delta update
                        (p[j] / (n * prod(p))) * mcrossprod(
                            mlm(R, Map(solve, Deltas[-j]), modes[-j]), R, modes[j])
                    },
                    params = old.Deltas,
                    max.iter = 5L)
            },
            # Linear Combination of parameters, basically: a * lhs + b * rhs for each
            # combination of elements in the LHS and RHS lists with scalars a, b.
            fun.lincomb = function(a, LHS, b, RHS) {
                Map(function(lhs, rhs) a * lhs + b * rhs, LHS, RHS)
            },
            # squared norm of parameters
            fun.norm2 = function(params) sum(unlist(params, use.names = FALSE)^2),
            ...,
            callback = if (is.function(logger)) {
                    function(iter, alphas, Deltas) {
                        logger("mle", iter, alphas, Deltas)
                    }
                } else {
                    NULL
                })
        # Remap parameter names
        names(params) <- c("alphas", "Deltas")
    }

    params
}

#' Higher Order Parametric Inverse Regression
#'
#' @export
HOPIR <- function(X, Fy, sample.axis, method = c("ls", "mle"),
    algorithm = c("icu", "nagd"), ..., center = TRUE, logger = NULL
) {
    # Predair and check input parameters
    method <- match.arg(method)
    algorithm <- match.arg(algorithm)
    # ensure response is tensor valued
    if (!is.array(Fy)) {
        # scalar response case (add new axis of size 1)
        dim(Fy) <- ifelse(seq_along(dim(X)) == sample.axis, dim(X)[sample.axis], 1L)
    }
    # Check dimensions and matching of axis (tensor order)
    stopifnot(exprs = {
        length(dim(X)) == length(dim(Fy))
        dim(X)[sample.axis] == dim(Fy)[sample.axis]
    })
    # warn about occurence of an axis without reduction
    if (any(dim(Fy)[-sample.axis] >= dim(X)[-sample.axis])) {
        warning("Degenerate case 'any(dim(Fy)[-sample.axis] >= dim(X)[-sample.axis])'")
    }

    # Set the default logger
    if (is.logical(logger) && logger) {
        start <- Sys.time()
        logger <- function(method, iter, ...) {
            if (iter) {
                cat(sprintf("%4d - %s - Elapsed: %s\n",
                    iter, method, format(Sys.time() - start)))
            }
        }
    }

    # Get axis indices (observation modes)
    modes <- seq_along(dim(X))[-sample.axis]
    n <- dim(X)[sample.axis]                    # observation count (scalar)
    p <- dim(X)[-sample.axis]                   # predictor dimensions (vector)

    # center data (predictors and responses)
    if (center) {
        # Means for X and Fy (a.k.a. sum elements over the sample axis)
        meanX <- apply(X, modes, mean, simplify = TRUE)
        meanFy <- apply(Fy, modes, mean, simplify = TRUE)
        # Center both X and Fy
        X <- sweep(X, modes, meanX)
        Fy <- sweep(Fy, modes, meanFy)
    } else {
        meanX <- meanFy <- NA
    }


    ### Step 0: Initial parameter estimates HOPCA
    alphas <- Map(function(mode, ncol) {
        La.svd(mcrossprod(X, mode = mode), ncol)$u
    }, modes, dim(Fy)[modes])

    ### Step 1: LS estimate
    ls <- HOPIR.ls(X, Fy, alphas, sample.axis,
        algorithm, ..., logger = logger)

    ### Step 2: MLE estimate
    if (method == "mle") {
        mle <- HOPIR.mle(X, Fy, ls$alphas, ls$Deltas, sample.axis,
            algorithm, ..., logger = logger)

        # add means and LS estimates as attribute
        structure(
            c(mle, list(meanX = meanX, meanFy = meanFy)),
            ls = ls
        )
    } else {
        # add means to LS estimates
        c(ls, list(meanX = meanX, meanFy = meanFy))
    }
}