#' Specialized version of the GMLM for the Ising model (inverse Ising problem)
#'
#' @export
gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
    max.iter = 1000L,
    eps = sqrt(.Machine$double.eps),
    step.size = function(iter) 1e-2 * (1000 / (iter + 1000)),
    zig.zag.threashold = 5L,
    patience = 5L,
    nr.slices = 10L,                    # only for univariate `F(y) = y`
    slice.method = c("cut", "ecdf"),    # only for univariate `F(y) = y` and `y` is a factor or integer
    logger = function(...) { }
) {
    # # Special case for univariate response `vec F(y) = y`
    # # Due to high computational costs we use slicing
    # if (length(F) == prod(dim(F))) {
    #     y <- as.vector(F)
    #     if (!(is.factor(y) || is.integer(y))) {
    #         slice.method <- match.arg(slice.method)
    #         if (slice.method == "ecdf") {
    #             y <- cut(ecdf(y)(y), nr.slices)
    #         } else {
    #             y <- cut(y, nr.slices)
    #         }
    #     }
    #     slices <- split(seq_len(sample.size), y, drop = TRUE)
    # } else {
    #     slices <- seq_len(sample.size)
    # }

    dimX <- head(dim(X), -1)
    dimF <- head(dim(F), -1)
    sample.axis <- length(dim(X))
    modes <- seq_len(length(dim(X)) - 1)
    sample.size <- tail(dim(X), 1)

    betas <- Map(matrix, Map(rnorm, dimX * dimF), dimX)
    Omegas <- Map(diag, dimX)

    grad2_betas <- Map(array, 0, Map(dim, betas))
    grad2_Omegas <- Map(array, 0, Map(dim, Omegas))

    # Keep track of the last loss to accumulate loss difference sign changes
    # indicating optimization instabilities as a sign to stop
    last_loss <- Inf
    accum_sign <- 1

    # non improving iteration counter
    non_improving <- 0L

    # technical access points to dynamicaly access a multi-dimensional array
    # with its last index
    `X[..., i]` <- slice.expr(X, sample.axis, index = i)
    `F[..., i]` <- slice.expr(F, sample.axis, index = i, drop = FALSE)
    # the next expression if accessing the precomputed `mlm(F, betas)`
    `BF[..., i]` <- slice.expr(BF, sample.axis, nr.axis = sample.axis, drop = FALSE) # BF[..., i]


    # Iterate till a break condition triggers or till max. nr. of iterations
    for (iter in seq_len(max.iter)) {

        grad_betas <- Map(matrix, 0, dimX, dimF)
        Omega <- Reduce(kronecker, rev(Omegas))

        R2 <- array(0, dim = c(dimX, dimX))

        # negative log-likelihood
        loss <- 0

        BF <- mlm(F, betas)

        for (i in seq_len(sample.size)) {
            params_i <- Omega + diag(as.vector(eval(`BF[..., i]`)))
            m2_i <- ising_m2(params_i)

            # accumulate loss
            x_i <- as.vector(eval(`X[..., i]`))
            loss <- loss - (sum(x_i * (params_i %*% x_i)) + log(attr(m2_i, "prob_0")))

            R2_i <- tcrossprod(x_i) - m2_i
            R1_i <- diag(R2_i)
            dim(R1_i) <- dimX

            for (j in modes) {
                grad_betas[[j]] <- grad_betas[[j]] +
                    mcrossprod(
                        R1_i, mlm(eval(`F[..., i]`), betas[-j], modes[-j]), j,
                        dimB = ifelse(j != modes, dimX, dimF)
                    )
            }
            R2 <- R2 + as.vector(R2_i)
        }

        grad_Omegas <- Map(function(j) {
            grad <- mlm(kronperm(R2), Map(as.vector, Omegas[-j]), modes[-j], transposed = TRUE)
            dim(grad) <- dim(Omegas[[j]])
            grad
        }, modes)


        # update and accumulate alternating loss
        accum_sign <- sign(last_loss - loss) - accum_sign
        # check if accumulated alternating signs exceed stopping threshold
        if (abs(accum_sign) > zig.zag.threashold) { break }
        # increment non improving counter if thats the case
        if (!(loss < last_loss)) {
            non_improving <- non_improving + 1L
        } else {
            non_improving <- 0L
        }
        if (non_improving > patience) { break }

        # store current loss for the next iteration
        last_loss <- loss

        # Accumulate root mean squared gradiends
        grad2_betas  <- Map(function(g2, g) 0.9 * g2 + 0.1 * (g * g),
            grad2_betas, grad_betas)
        grad2_Omegas <- Map(function(g2, g) 0.9 * g2 + 0.1 * (g * g),
            grad2_Omegas, grad_Omegas)

        # logging (before parameter update)
        logger(iter, loss, betas, Omegas, grad_betas, grad_Omegas)

        # gradualy decrease the step size
        step <- if (is.function(step.size)) step.size(iter) else step.size

        # Update Parameters
        betas <- Map(function(beta, grad, m2) {
            beta + (step / (sqrt(m2) + eps)) * grad
        }, betas, grad_betas, grad2_betas)
        Omegas <- Map(function(Omega, grad, m2) {
            Omega + (step / (sqrt(m2) + eps)) * grad
        }, Omegas, grad_Omegas, grad2_Omegas)
    }

    list(betas = betas, Omegas = Omegas)
}