#' Specialized version of the GMLM for the Ising model (inverse Ising problem)
#'
#' Estimates the mode-wise parameter matrices `betas` and `Omegas` with an
#' iterative gradient scheme. Every parameter element is updated with a step
#' size scaled by the root of a running mean of its squared gradients
#' (RMSprop-like). The `betas` are initialized with the `gmlm_tensor_normal`
#' fit (ignoring that the data is binary), the `Omegas` with log odds computed
#' from mode-wise second moments of `X`.
#'
#' Pairs of variables which never occur together in the data (degenerate
#' combinations) get their interaction parameters bounded after initialization
#' and after every parameter update.
#'
#' @param X array of predictors (treated as binary) where the axis
#'  `sample.axis` enumerates the observations.
#' @param F array with the same sample axis as `X`, or a vector without `dim`
#'  attribute which is treated as one scalar per observation.
#' @param sample.axis index of the axis enumerating the observations, defaults
#'  to the last axis of `X`.
#' @param max.iter maximum number of iterations.
#' @param eps relative loss change below which the iteration stops, also added
#'  to the denominator of the step size scaling for numerical stability.
#' @param step.size base step size of the parameter updates.
#' @param zig.zag.threashold the iteration stops once the accumulated
#'  alternating loss difference signs exceed this value in absolute value.
#' @param patience the iteration stops once the non-improving counter (plus
#'  one for an increased loss, minus one otherwise, never below zero) exceeds
#'  this value.
#' @param nr.slices number of intervals handed to `cut` when slicing a
#'  univariate response.
#' @param slice.method slicing of a univariate response; `"cut"` cuts the
#'  response itself, `"ecdf"` cuts its empirical distribution function values
#'  and `"none"` treats every observation on its own. An integer response is
#'  used as grouping variable directly.
#' @param logger function called in every iteration (before the parameter
#'  update) as `logger(iter, loss, betas, Omegas, grad_betas, grad_Omegas)`.
#'
#' @return list with the elements `eta1` (zero array with the dimensions of a
#'  single observation), `betas` and `Omegas`, carrying the attributes
#'  `tensor_normal` (the initial tensor normal fit), `Omegas.init` (the initial
#'  `Omegas`) and `degen.mask` (indices of the degenerate combinations).
#'
#' @todo TODO: Add beta and Omega projections
#'
#' @export
gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
    # proj.betas = ..., proj.Omegas = ...,  # TODO: this
    max.iter = 1000L,
    eps = sqrt(.Machine$double.eps),
    step.size = 1e-3,
    zig.zag.threashold = 20L,
    patience = 3L,
    nr.slices = 20L,                            # only for univariate `F(y) = y`
    slice.method = c("cut", "ecdf", "none"),    # only for univariate `F(y) = y` and `y` is a factor or integer
    logger = function(...) { }
) {
    # Get problem dimensions (the sample size is required to give a `F`
    # without dimensions its shape, so it is determined first)
    dimX <- dim(X)[-sample.axis]
    sample.size <- dim(X)[sample.axis]
    # treat scalar `F` as a tensor
    if (is.null(dim(F))) {
        dimF <- rep(1L, length(dimX))
        dim(F) <- ifelse(seq_along(dim(X)) == sample.axis, sample.size, 1L)
    } else {
        dimF <- dim(F)[-sample.axis]
    }

    # rearrange `X`, `F` such that the last axis enumerates observations
    if (sample.axis != length(dim(X))) {
        axis.perm <- c(seq_along(dim(X))[-sample.axis], sample.axis)
        X <- aperm(X, axis.perm)
        F <- aperm(F, axis.perm)
        sample.axis <- length(dim(X))
    }
    modes <- seq_along(dimX)

    # Special case for univariate response `vec F(y) = y`
    # Due to high computational costs we use slicing
    slice.method <- match.arg(slice.method)
    # `F` is univariate iff it holds exactly one value per observation
    slices.ind <- if ((slice.method != "none") && (length(F) == sample.size)) {
        y <- as.vector(F)
        if (!(is.factor(y) || is.integer(y))) {
            y <- if (slice.method == "ecdf") {
                cut(ecdf(y)(y), nr.slices)
            } else {
                cut(y, nr.slices)
            }
        }
        # list of observation indices per (non-empty) slice
        split(seq_len(sample.size), y, drop = TRUE)
    } else {
        # no slicing, every observation is its own slice
        seq_len(sample.size)
    }


    # initialize betas with tensor normal estimate (ignoring data being binary)
    fit_normal <- gmlm_tensor_normal(X, F, sample.axis = length(dim(X)))
    betas <- fit_normal$betas

    # initialize Omegas with mode-wise log odds (zero diagonal)
    Omegas <- Omegas.init <- Map(function(mode) {
        n <- prod(dim(X)[-mode])
        prob2 <- mcrossprod(X, mode = mode) / n
        # avoid `log(0)` for combinations never observed
        prob2[prob2 == 0] <- 1 / n
        prob1 <- diag(prob2)
        `prob1^2` <- outer(prob1, prob1)

        `diag<-`(log(((1 - `prob1^2`) / `prob1^2`) * prob2 / (1 - prob2)), 0)
    }, modes)

    # Determine degenerate combinations, that are variables which are exclusive
    # in the data set
    matX <- mat(X, sample.axis)
    degen <- crossprod(matX) == 0
    degen.mask <- which(degen)
    # `degen.mask` holds indices (not logicals), test for its length
    has.degen <- length(degen.mask) > 0L
    # If there are degenerate combination, compute an (arbitrary) bound the
    # log odds parameters of those combinations
    if (has.degen) {
        pair.ind <- arrayInd(degen.mask, dim(degen))
        meanX <- colMeans(matX)
        prodX <- meanX[pair.ind[, 1]] * meanX[pair.ind[, 2]]
        degen.bounds <- log((1 - prodX) / (prodX * sample.size))
        # Component indices in Omegas of degenerate two-way interactions
        multi.ind <- arrayInd(degen.mask, rep(dimX, 2))
        degen.ind <- lapply(modes, function(m) {
            multi.ind[, m] + dimX[m] * (multi.ind[, m + length(dimX)] - 1L)
        })

        # Shrinks the (absolute) values of the degenerate interaction
        # parameters such that all degeneracy constraints hold
        enforce.degen <- function(Omegas) {
            # Extract parameters corresponding to degenerate interactions
            degen.params <- do.call(rbind, Map(`[`, Omegas, degen.ind))
            # Degeneracy Constrained Parameters (sign is dropped)
            DCP <- mapply(function(vals, bound) {
                logVals <- log(abs(vals))
                err <- max(0, sum(logVals) - log(abs(bound)))
                exp(logVals - (err / length(vals)))
            }, split(degen.params, col(degen.params)), degen.bounds)
            # Update values in Omegas such that all degeneracy constraints hold
            Map(function(Omega, cp, ind) {
                # Combine multiple constraints for every element into single
                # constraint value per element
                cp <- mapply(min, split(abs(cp), ind))
                ind <- as.integer(names(cp))
                `[<-`(Omega, ind, sign(Omega[ind]) * cp)
            }, Omegas, split(DCP, row(DCP)), degen.ind)
        }

        # Enforce initial value degeneracy interaction param. constraints
        Omegas <- enforce.degen(Omegas)
    }

    # Initialize mean squared gradients
    grad2_betas <- Map(array, 0, Map(dim, betas))
    grad2_Omegas <- Map(array, 0, Map(dim, Omegas))

    # Keep track of the last loss to accumulate loss difference sign changes
    # indicating optimization instabilities as a sign to stop
    last_loss <- Inf
    accum_sign <- 1

    # non improving iteration counter
    non_improving <- 0L

    # technical access points to dynamically access a multi-dimensional array
    `X[..., i]` <- slice.expr(X, sample.axis, index = i, drop = FALSE)
    `F[..., i]` <- slice.expr(F, sample.axis, index = i, drop = FALSE)

    # Iterate till a break condition triggers or till max. nr. of iterations
    for (iter in seq_len(max.iter)) {

        grad_betas <- Map(matrix, 0, dimX, dimF)
        Omega <- Reduce(kronecker, rev(Omegas))

        # second order residuals accumulator
        # `sum_i (X_i o X_i - E[X o X | Y = y_i])`
        R2 <- array(0, dim = c(dimX, dimX))

        # negative log-likelihood
        loss <- 0

        for (i in slices.ind) {
            # slice size (nr. of objects in the slice)
            n_i <- length(i)

            sumF_i <- rowSums(eval(`F[..., i]`), dims = length(dimF))

            # natural parameters of the slice, evaluated at the slice mean
            diag_params_i <- mlm(sumF_i / n_i, betas)
            params_i <- Omega + diag(as.vector(diag_params_i))
            m2_i <- ising_m2(params_i)

            # accumulate loss
            matX_i <- mat(eval(`X[..., i]`), modes)
            loss <- loss - (
                sum(matX_i * (params_i %*% matX_i)) + n_i * log(attr(m2_i, "prob_0"))
            )

            # second and first (diagonal) order residuals of the slice
            R2_i <- tcrossprod(matX_i) - n_i * m2_i
            R1_i <- diag(R2_i)
            dim(R1_i) <- dimX

            for (j in modes) {
                grad_betas[[j]] <- grad_betas[[j]] +
                    mcrossprod(R1_i, mlm(sumF_i, betas[-j], modes[-j]), j)
            }
            R2 <- R2 + as.vector(R2_i)
        }

        # the rearranged residuals are identical for all modes
        R2.perm <- kronperm(R2)
        grad_Omegas <- Map(function(j) {
            grad <- mlm(R2.perm, Map(as.vector, Omegas[-j]), modes[-j], transposed = TRUE)
            dim(grad) <- dim(Omegas[[j]])
            grad
        }, modes)


        # update optimization behavioral trackers
        accum_sign <- sign(last_loss - loss) - accum_sign
        non_improving <- max(0L, non_improving - 1L + 2L * (last_loss < loss))

        # check break conditions
        if (abs(accum_sign) > zig.zag.threashold) { break }
        if (non_improving > patience) { break }
        if (abs(last_loss - loss) < eps * last_loss) { break }

        # store current loss for the next iteration
        last_loss <- loss

        # Accumulate root mean squared gradients
        grad2_betas  <- Map(function(g2, g) 0.9 * g2 + 0.1 * (g * g),
            grad2_betas, grad_betas)
        grad2_Omegas <- Map(function(g2, g) 0.9 * g2 + 0.1 * (g * g),
            grad2_Omegas, grad_Omegas)

        # logging (before parameter update)
        logger(iter, loss, betas, Omegas, grad_betas, grad_Omegas)

        # Update Parameters
        betas <- Map(function(beta, grad, m2) {
            beta + (step.size / (sqrt(m2) + eps)) * grad
        }, betas, grad_betas, grad2_betas)
        Omegas <- Map(function(Omega, grad, m2) {
            Omega + (step.size / (sqrt(m2) + eps)) * grad
        }, Omegas, grad_Omegas, grad2_Omegas)

        # Enforce degeneracy parameter constraints
        if (has.degen) {
            Omegas <- enforce.degen(Omegas)
        }
    }

    structure(
        list(eta1 = array(0, dimX), betas = betas, Omegas = Omegas),
        tensor_normal = fit_normal,
        Omegas.init = Omegas.init,
        degen.mask = degen.mask
    )
}


################################################################################
###      Development Interactive Block (Delete / Make sim / TODO: ...)       ###
################################################################################
if (FALSE) { # interactive()

# Dark theme for all plots of this development session
par(
    bg = "#1d1d1d", fg = "lightgray",
    col = "#d5d5d5", col.axis = "#d5d5d5", col.lab = "#d5d5d5",
    col.main = "#d5d5d5", col.sub = "#d5d5d5",  # alternative: col.sub = "#2467d0"
    pch = 16
)
# `cex` and `col` (256 colors from red over the background color to blue) are
# handed to the `matrixImage` calls below
cex <- 1.25
col <- colorRampPalette(c("#f15050", "#1d1d1d", "#567DCA"))(256)


# Creates a logging callback for `gmlm_ising`.
#
# Calling `.logger()` (re)sets the globals `log` (preallocated data frame with
# one row per logged iteration), `B.gmlm` and `Omega.gmlm`. The returned
# function stores the current estimates in the global environment (allows to
# interrupt the fit and still get the results), appends one row to `log` and
# prints a progress line. It reads the globals `B.true` and `Omega.true`.
.logger <- function() {
    # number of preallocated log rows
    n.rows <- 100000
    # number of rows written so far
    row <- 0L

    history <- data.frame(
        iter            = rep(NA_integer_, n.rows),
        loss            = rep(NA_real_, n.rows),
        dist.B          = rep(NA_real_, n.rows),
        dist.Omega      = rep(NA_real_, n.rows),
        norm.grad.B     = rep(NA_real_, n.rows),
        norm.grad.Omega = rep(NA_real_, n.rows)
    )
    assign("log", history, envir = .GlobalEnv)
    assign("B.gmlm", NULL, .GlobalEnv)
    assign("Omega.gmlm", NULL, .GlobalEnv)

    function(it, loss, betas, Omegas, grad_betas, grad_Omegas) {
        # current estimates, published in the global environment
        B.gmlm <- Reduce(kronecker, rev(betas))
        Omega.gmlm <- Reduce(kronecker, rev(Omegas))
        assign("B.gmlm", B.gmlm, .GlobalEnv)
        assign("Omega.gmlm", Omega.gmlm, .GlobalEnv)

        # distances to the true parameters
        dist.B     <- dist.subspace(B.true, B.gmlm, normalize = TRUE)
        dist.Omega <- norm(Omega.true - Omega.gmlm, "F")

        # overall gradient norms from the mode-wise Frobenius norms
        norm.grad.B <- sqrt(sum(
            vapply(grad_betas, norm, numeric(1), "F")^2
        ))
        norm.grad.Omega <- sqrt(sum(
            vapply(grad_Omegas, norm, numeric(1), "F")^2
        ))

        # write the next row of the global log
        log[row <<- row + 1L, ] <<- list(
            it, loss, dist.B, dist.Omega, norm.grad.B, norm.grad.Omega
        )
        cat(sprintf("\r%3d - d(B): %.3f, d(O): %.3f, |g(B)|: %.3f, |g(O)|: %.3f, loss: %.3f\033[K",
            it, dist.B, dist.Omega, norm.grad.B, norm.grad.Omega, loss))
    }
}


# simulation setup
sample.size <- 1000
dimX <- c(2, 3)                 # predictor `X` dimension
dimF <- rep(1, length(dimX))    # "function" `F(y)` of response `y` dimension

# true parameters and their Kronecker products
betas <- Map(diag, 1, dimX, dimF)
Omegas <- list(toeplitz(c(0, -2)), toeplitz(seq(1, 0, by = -0.5)))

B.true <- Reduce(kronecker, rev(betas))
Omega.true <- Reduce(kronecker, rev(Omegas))

# data sampling routine, gives the list `X`, `F`, `y` and `sample.axis`
sample.data <- function(sample.size, betas, Omegas) {
    dimX <- mapply(nrow, betas)
    dimF <- mapply(ncol, betas)

    # generate response ~ U[-2, 2] (sample axis is last axis)
    y <- runif(prod(sample.size, dimF), -2, 2)
    F <- array(y, dim = c(dimF, sample.size))

    Omega <- Reduce(kronecker, rev(Omegas))

    # draw one Ising sample per observation given its `F`
    draw <- function(Fi) {
        dim(Fi) <- dimF
        params <- diag(as.vector(mlm(Fi, betas))) + Omega
        tensorPredictors::ising_sample(1, params)
    }
    X <- apply(F, length(dim(F)), draw)
    dim(X) <- c(dimX, sample.size)

    list(X = X, F = F, y = y, sample.axis = length(dim(X)))
}
c(X, F, y, sample.axis) %<-% sample.data(sample.size, betas, Omegas)

# Print the table of joint occurrence counts of all pairs of variables
local({
    # labels `X<i1><i2>...` built from the mode-wise indices of every variable
    proto <- array(seq_len(prod(dimX)), dimX)
    mode.index <- lapply(seq_along(dimX), function(mode) slice.index(proto, mode))
    labels <- do.call(paste0, c("X", mode.index))

    counts <- crossprod(mat(X, sample.axis))
    dimnames(counts) <- list(labels, labels)

    cat("Sample Size: ", sample.size, "\n")
    print.table(counts, zero.print = ".")
})

# Fit the Ising GMLM under the profiler. The fit is stored in `fit.gmlm` and
# the iterations are logged with `.logger()` (filling the global `log`), both
# are required by the diagnostics below.
# Alternative without profiling:
# system.time({
#     fit.gmlm <- gmlm_ising(X, y, logger = .logger())
# })
Rprof()
fit.gmlm <- gmlm_ising(X, y, logger = .logger())
Rprof(NULL)
summaryRprof()

# Kronecker products of the final estimates
B.gmlm <- Reduce(kronecker, rev(fit.gmlm$betas))
Omega.gmlm <- Reduce(kronecker, rev(fit.gmlm$Omegas))

# initial values and degenerate combinations stored as attributes of the fit
B.normal <- Reduce(kronecker, rev(attr(fit.gmlm, "tensor_normal")$betas))
Omega.init <- Reduce(kronecker, rev(attr(fit.gmlm, "Omegas.init")))
degen.mask <- attr(fit.gmlm, "degen.mask")

# Diagnostics plot: optimization traces (left), a placeholder panel and the
# legend (top right), true / initial / estimated `Omega` and `B` as matrix
# images (remaining panels)
local({
    layout(matrix(c(
        1, 2, 3, 3, 3,
        1, 4, 5, 6, 7
    ), nrow = 2, byrow = TRUE), widths = c(6, 3, 1, 1, 1))

    # traces of the logged iterations (unused preallocated rows are dropped)
    with(na.omit(log), {
        plot(range(iter), c(0, 1), type = "n", bty = "n",
            xlab = "Iterations", ylab = "Distance")

        # distances and loss, scaled into [0, 1]
        lines(iter, dist.B, col = "red", lwd = 2)
        lines(iter, dist.Omega / max(dist.Omega), col = "blue", lwd = 2)
        lines(iter, (loss - min(loss)) / diff(range(loss)), col = "darkgreen", lwd = 2)

        norm.grad <- sqrt(norm.grad.B^2 + norm.grad.Omega^2)
        # Scale all gradient norms
        norm.grad.B     <- norm.grad.B     / max(norm.grad)
        norm.grad.Omega <- norm.grad.Omega / max(norm.grad)
        norm.grad       <- norm.grad       / max(norm.grad)
        lines(iter, norm.grad.B,     lty = 2, col = "red")
        lines(iter, norm.grad.Omega, lty = 2, col = "blue")
        lines(iter, norm.grad,       lty = 2, col = "darkgreen")

        # right axis ticks at the final and minimal distances (`Omega` ticks
        # are placed on the scaled axis but labeled with the unscaled values)
        axis(4, at = c(
            tail(dist.B, 1),
            min(dist.B)
        ), labels = round(c(
            tail(dist.B, 1),
            min(dist.B)
        ), 2), col = NA, col.ticks = "red", las = 1)
        axis(4, at = c(
            1,
            tail(dist.Omega, 1) / max(dist.Omega),
            min(dist.Omega) / max(dist.Omega)
        ), labels = round(c(
            max(dist.Omega),
            tail(dist.Omega, 1),
            min(dist.Omega)
        ), 2), col = NA, col.ticks = "blue", las = 1)

        abline(h = c(tail(dist.B, 1), min(dist.B)),
            lty = "dotted", col = "red")
        abline(h = c(max(dist.Omega), tail(dist.Omega, 1), min(dist.Omega)) / max(dist.Omega),
            lty = "dotted", col = "blue")

    })
    legend("topright", col = c("red", "blue", "darkgreen"), lty = 1, lwd = 2,
        legend = c("dist.B", "dist.Omega", "loss"), bty = "n")

    # `Omega`: true, initial and estimated with a common symmetric color range,
    # the Frobenius distance to the truth is written above the estimates
    zlim <- max(abs(range(Omega.true, Omega.init, Omega.gmlm))) * c(-1, 1)
    matrixImage(Omega.true,   main = "true",   zlim = zlim, add.values = TRUE, col = col, cex = cex)
    matrixImage(round(Omega.init, 2), main = "init (cond. prob.)", zlim = zlim, add.values = TRUE, col = col, cex = cex)
    mtext(round(norm(Omega.true - Omega.init, "F"), 3), 3)
    # values of degenerate combinations are colored red
    matrixImage(round(Omega.gmlm, 2),   main = "gmlm (ising)",   zlim = zlim, add.values = TRUE, col = col, cex = cex,
        col.values = c(par("col"), "red")[`[<-`(array(1, rep(prod(dim(X)[-sample.axis]), 2)), degen.mask, 2)])
    mtext(round(norm(Omega.true - Omega.gmlm, "F"), 3), 3)

    # `B`: true, initial and estimated with a common symmetric color range,
    # the subspace distance to the truth is written above the estimates
    zlim <- max(abs(range(B.true, B.normal, B.gmlm))) * c(-1, 1)
    matrixImage(B.true,             main = "true",
        zlim = zlim, add.values = TRUE, col = col, cex = cex)
    matrixImage(round(B.normal, 2), main = "init (normal)",
        zlim = zlim, add.values = TRUE, axes = FALSE, col = col, cex = cex)
    mtext(round(dist.subspace(B.true, B.normal, normalize = TRUE), 3), 3)
    matrixImage(round(B.gmlm, 2),   main = "gmlm (ising)",
        zlim = zlim, add.values = TRUE, axes = FALSE, col = col, cex = cex)
    mtext(round(dist.subspace(B.true, B.gmlm, normalize = TRUE), 3), 3)
})

}