#' Fitting Generalized Multi-Linear Models
#'
#' @export
GMLM <- function(...) {
    stop("Not Implemented")
}


make.gmlm.family <- function(name) {
    # standardize family name
    name <- list(
        normal = "normal", gaussian = "normal",
        bernoulli = "bernoulli", ising = "bernoulli"
    )[[tolower(name), exact = FALSE]]

    ############################################################################
    #                                                                          #
    #    TODO: better (and possibly specialized) initial parameters!?!?!?!     #
    #                                                                          #
    ############################################################################

    switch(name,
        normal = {
            initialize <- function(X, Fy) {
                p <- head(dim(X), -1)
                r <- length(p)

                # Mode-Covariances
                XSigmas <- mcov(X, sample.axis = r + 1L)
                YSigmas <- mcov(Fy, sample.axis = r + 1L)

                # Extract main mode covariance directions
                # Note: (the directions are transposed!)
                XDirs <- Map(function(Sigma) {
                    with(La.svd(Sigma, nu = 0), sqrt(d) * vt)
                }, XSigmas)
                YDirs <- Map(function(Sigma) {
                    with(La.svd(Sigma, nu = 0), sqrt(d) * vt)
                }, YSigmas)

                alphas <- Map(function(xdir, ydir) {
                    s <- min(ncol(xdir), nrow(ydir))
                    crossprod(xdir[seq_len(s), , drop = FALSE],
                              ydir[seq_len(s), , drop = FALSE])
                }, XDirs, YDirs)

                list(
                    eta1 = rowMeans(X, dims = r),
                    alphas = alphas,
                    Omegas = Map(diag, p)
                )
            }

            # parameters of the tensor normal computed from the GLM parameters
            params <- function(Fy, eta1, alphas, Omegas) {
                Deltas <- Map(solve, Omegas)
                mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
                list(mu_y = mu_y, Deltas = Deltas)
            }

            # scaled negative log-likelihood
            log.likelihood <- function(X, Fy, eta1, alphas, Omegas) {
                n <- tail(dim(X), 1)        # sample size

                # conditional mean
                mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas))

                # negative log-likelihood scaled by sample size
                # Note: the `suppressWarnings` is cause `log(mapply(det, Omegas)`
                # migth fail, but `NAGD` has failsaves againt cases of "illegal"
                # parameters.
                suppressWarnings(
                    0.5 * prod(p) * log(2 * pi) +
                        sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) -
                            (0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p)
                )
            }
            # gradient of the scaled negative log-likelihood
            grad <- function(X, Fy, eta1, alphas, Omegas) {
                # retrieve dimensions
                n <- tail(dim(X), 1)        # sample size
                p <- head(dim(X), -1)       # predictor dimensions
                q <- head(dim(Fy), -1)      # response dimensions
                r <- length(p)              # single predictor/response tensor order

                ## "Inverse" Link: Tensor Normal Specific
                # known exponential family constants
                c1 <- 1
                c2 <- -0.5

                # Covariances from the GLM parameter Scatter matrices
                Deltas <- Map(solve, Omegas)

                # First moment via "inverse" link `g1(eta_y) = E[X | Y = y]`
                E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)

                # Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]`
                dim(E1) <- c(prod(p), n)
                E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1))
                ## end "Inverse" Link

                dim(X) <- c(prod(p), n)

                # Residuals
                R <- X - E1
                dim(R) <- c(p, n)

                # mean deviation between the sample covariance to GLM estimated covariance
                # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
                S <- rowMeans(colKronecker(X, X)) - E2  # <- Optimized for Tensor Normal
                dim(S) <- c(p, p)           # reshape to tensor or order `2 r`

                # Gradients of the negative log-likelihood scaled by sample size
                list(
                    "Dl(eta1)" = -c1 * rowMeans(R, dims = r),
                    "Dl(alphas)" = Map(function(j) {
                        (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
                    }, 1:r),
                    "Dl(Omegas)" = Map(function(j) {
                        deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
                        # addapt to symmetric constraint for the derivative
                        dim(deriv) <- c(p[j], p[j])
                        deriv + t(deriv * (1 - diag(p[j])))
                    }, 1:r)
                )
            }
        },
        bernoulli = {
            require(mvbernoulli)

            initialize <- function(X, Fy) {
                p <- head(dim(X), -1)
                r <- length(p)

                # Mode-Covariances
                XSigmas <- mcov(X, sample.axis = r + 1L)
                YSigmas <- mcov(Fy, sample.axis = r + 1L)

                # Extract main mode covariance directions
                # Note: (the directions are transposed!)
                XDirs <- Map(function(Sigma) {
                    with(La.svd(Sigma, nu = 0), sqrt(d) * vt)
                }, XSigmas)
                YDirs <- Map(function(Sigma) {
                    with(La.svd(Sigma, nu = 0), sqrt(d) * vt)
                }, YSigmas)

                alphas <- Map(function(xdir, ydir) {
                    s <- min(ncol(xdir), nrow(ydir))
                    crossprod(xdir[seq_len(s), , drop = FALSE],
                              ydir[seq_len(s), , drop = FALSE])
                }, XDirs, YDirs)

                list(
                    eta1 = array(0, dim = p),
                    alphas = alphas,
                    Omegas = Map(diag, p)
                )
            }

            # initialize <- function(X, Fy) {
            #     r <- length(dim(X)) - 1L

            #     # Mode-Covariances
            #     XSigmas <- mcov(X, sample.axis = r + 1L)
            #     YSigmas <- mcov(Fy, sample.axis = r + 1L)

            #     # Extract main mode covariance directions
            #     # Note: (the directions are transposed!)
            #     XDirs <- Map(function(Sigma) {
            #         with(La.svd(Sigma, nu = 0), sqrt(d) * vt)
            #     }, XSigmas)
            #     YDirs <- Map(function(Sigma) {
            #         with(La.svd(Sigma, nu = 0), sqrt(d) * vt)
            #     }, YSigmas)

            #     alphas <- Map(function(xdir, ydir) {
            #         s <- min(ncol(xdir), nrow(ydir))
            #         crossprod(xdir[seq_len(s), , drop = FALSE],
            #                   ydir[seq_len(s), , drop = FALSE])
            #     }, XDirs, YDirs)

            #     # Scatter matrices from Residuals (intercept not considered)
            #     Deltas <- mcov(X - mlm(Fy, alphas), sample.axis = r + 1L)
            #     Omegas <- Map(solve, Deltas)

            #     # and the intercept
            #     eta1 <- mlm(rowMeans(X, dims = r), Deltas)

            #     list(
            #         eta1 = eta1,
            #         alphas = alphas,
            #         Omegas = Omegas
            #     )
            # }

            params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
                # number of observations
                n <- tail(dim(Fy), 1)

                # natural exponential family parameters
                eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1))
                eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas))

                # # next the conditional Ising model parameters `theta_y`
                # theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n)
                # dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n)

                # ltri <- which(lower.tri(eta_y2, diag = TRUE))
                # diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
                # theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1)
                # theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ]

                # conditional Ising model parameters
                theta_y <- matrix(rep(vech(eta_y2), n), ncol = n)
                ltri <- which(lower.tri(eta_y2, diag = TRUE))
                diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
                theta_y[diagonal, ] <- eta_y1

                theta_y
            }

            # Scaled ngative log-likelihood
            log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
                # number of observations
                n <- tail(dim(X), 1L)

                # conditional Ising model parameters
                theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)

                # convert to binary data set
                storage.mode(X) <- "integer"
                X.mvb <- as.mvbinary(mat(X, length(dim(X))))

                # log-likelihood of the data set
                -mean(sapply(seq_len(n), function(i) {
                    ising_log_likelihood(theta_y[, i], X.mvb[i])
                }))
            }

            # Gradient of the scaled negative log-likelihood
            grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
                # retrieve dimensions
                n <- tail(dim(X), 1)        # sample size
                p <- head(dim(X), -1)       # predictor dimensions
                q <- head(dim(Fy), -1)      # response dimensions
                r <- length(p)              # single predictor/response tensor order

                ## "Inverse" Link: Ising Model Specific
                # conditional Ising model parameters: `p (p + 1) / 2` by `n`
                theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)

                # conditional expectations
                #   ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y]
                E2 <- apply(theta_y, 2L, ising_marginal_probs)
                #   convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y]
                E2 <- E2[vech.pinv.index(prod(p)), ]
                #   extract diagonal elements which are equal to E[vec(X) | Y = y]
                E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ]
                ## end "Inverse" Link

                dim(X) <- c(prod(p), n)

                # Residuals
                R <- X - E1
                dim(R) <- c(p, n)

                # mean deviation between the sample covariance to GLM estimated covariance
                # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
                S <- rowMeans(colKronecker(X, X) - E2)
                dim(S) <- c(p, p)           # reshape to tensor or order `2 r`

                # Gradients of the negative log-likelihood scaled by sample size
                list(
                    "Dl(eta1)" = -c1 * rowMeans(R, dims = r),
                    "Dl(alphas)" = Map(function(j) {
                        (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
                    }, 1:r),
                    "Dl(Omegas)" = Map(function(j) {
                        deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
                        # addapt to symmetric constraint for the derivative
                        dim(deriv) <- c(p[j], p[j])
                        deriv + t(deriv * (1 - diag(p[j])))
                    }, 1:r)
                )
            }
        }
    )

    list(
        family = name,
        initialize = initialize,
        params = params,
        # linkinv = linkinv,
        log.likelihood = log.likelihood,
        grad = grad
    )
}


#' @export
GMLM.default <- function(X, Fy, sample.axis = 1L,
    family = "normal",
    ...,
    eps = sqrt(.Machine$double.eps),
    logger = NULL
) {
    stopifnot(exprs = {
        length(sample.axis) == 1L
        (1L <= sample.axis) && (sample.axis <= length(dim(X)))
        (dim(X) == dim(Fy))[sample.axis]
    })

    # rearrange `X`, `Fy` such that the last axis enumerates observations
    axis.perm <- c(seq_along(dim(X))[-sample.axis], sample.axis)
    X  <- aperm(X,  axis.perm)
    Fy <- aperm(Fy, axis.perm)

    # setup family specific GLM (pseudo) "inverse" link
    family <- make.gmlm.family(family)

    # wrap logger in callback for NAGD
    callback <- if (is.function(logger)) {
        function(iter, params) {
            do.call(logger, c(list(iter), params))
        }
    }

    params.fit <- NAGD(
        fun.loss = function(params) {
            # scaled negative lig-likelihood
            #                            eta1         alphas       Omegas
            family$log.likelihood(X, Fy, params[[1]], params[[2]], params[[3]])
        },
        fun.grad = function(params) {
            # gradient of the scaled negative lig-likelihood
            #                  eta1         alphas       Omegas
            family$grad(X, Fy, params[[1]], params[[2]], params[[3]])
        },
        params = family$initialize(X, Fy), # initialen parameter estimates
        fun.lincomb = function(a, lhs, b, rhs) {
            list(
                a * lhs[[1]] + b * rhs[[1]],
                Map(function(l, r) a * l + b * r, lhs[[2]], rhs[[2]]),
                Map(function(l, r) a * l + b * r, lhs[[3]], rhs[[3]])
            )
        },
        fun.norm2 = function(params) {
            sum(unlist(params)^2)
        },
        callback = callback,
        ...
    )

    structure(params.fit, names = c("eta1", "alphas", "Omegas"))
}