library(mvbernoulli)

# Print a binary matrix as a table of 0/1 entries (zeros shown as ".").
#
# Rows are labeled "<row index>/<event number>" where the event number is the
# integer obtained by reading the row as a binary number with the left most
# column as the highest bit. Columns are labeled with their bit index.
#
# @param Y logical (or 0/1 valued) matrix, one event per row.
# @return (invisibly) the integer matrix with the labels set as dimnames.
printMVBinary <- function(Y) {
    stopifnot(length(dim(Y)) == 2)
    # number of bits is taken from the input, not from a global `p`
    p <- ncol(Y)
    Y <- array(as.integer(Y), dim = dim(Y))
    # bit index of every column, the left most column is the highest bit
    bitIndex <- rev(seq_len(p)) - 1
    eventIndex <- seq_len(nrow(Y))
    # event number of every row (binary number encoded by the row)
    eventNr <- drop(Y %*% 2^bitIndex)
    dimnames(Y) <- list(
        "Index/Event" = paste(eventIndex, eventNr, sep = "/"),
        "Bit Index" = as.character(bitIndex)
    )
    print.table(Y, zero.print = ".")
}


# Sample size `n` and number of binary variables `p` used by the checks below
n <- 100
p <- 6

# Draw (and print) a random parameter vector with one entry per element of the
# lower triangle (including the diagonal) of a `p x p` matrix
(theta <- rnorm(p * (p + 1) / 2))

# Round trip check: `theta` -> conditional probabilities -> `theta`
# NOTE: this assignment masks the base constant `pi` in the global environment
pi <- ising_cond_probs(theta)
all.equal(
    theta,
    ising_theta_from_cond_prob(pi)
)

# Each of the three plots below fills the lower triangle (incl. diagonal) of a
# `p x p` matrix with a vector and mirrors it into the upper triangle
# 1) the parameters `theta` themselves
tensorPredictors::matrixImage({
    Theta <- matrix(NA, p, p)
    Theta[lower.tri(Theta, diag = TRUE)] <- theta
    Theta[upper.tri(Theta)] <- t(Theta)[upper.tri(Theta)]
    Theta
}, main = expression(paste("natural Params ", Theta)))
# 2) the result of `ising_cond_probs(theta)`
tensorPredictors::matrixImage({
    PI <- matrix(NA, p, p)
    PI[lower.tri(PI, diag = TRUE)] <- ising_cond_probs(theta)
    PI[upper.tri(PI)] <- t(PI)[upper.tri(PI)]
    PI
}, main = expression(paste("Conditional Probs. P(", Y[i], " = ", Y[j], " = 1", " | ", Y[-i - j], " = ", 0, ")")))
# 3) the result of `ising_marginal_probs(theta)`
tensorPredictors::matrixImage({
    MAR <- matrix(NA, p, p)
    MAR[lower.tri(MAR, diag = TRUE)] <- ising_marginal_probs(theta)
    MAR[upper.tri(MAR)] <- t(MAR)[upper.tri(MAR)]
    MAR
}, main = expression(paste("Marginal Probs. P(", Y[i], " = ", Y[j], " = 1)")))

# Random `n x p` logical matrix; every entry is drawn uniformly from
# {TRUE, FALSE} (this is not a sample from the Ising model with `theta`)
Y <- matrix(sample(c(TRUE, FALSE), n * p, replace = TRUE), n)
printMVBinary(Y)


# Enumerate all `2^p` binary events of `p` variables.
#
# @param p number of binary variables, a single number `>= 1`.
# @return logical `2^p x p` matrix (no dimnames) whose rows count upwards in
#   binary with the first column as the highest bit.
allY <- function(p) {
    stopifnot(is.numeric(p), length(p) == 1, p >= 1)
    # start with a one column matrix such that `p = 1` also gives a matrix
    events <- matrix(c(FALSE, TRUE), ncol = 1)
    # every pass prepends one more (higher) bit, doubling the number of rows
    for (. in seq_len(p - 1)) {
        events <- rbind(
            cbind(FALSE, events),
            cbind(TRUE, events)
        )
    }
    events
}
# Print the table of all events generated by `allY` for `p` variables
printMVBinary(allY(p))

# Score as computed by `ising_score`, compared against a numeric gradient below
G <- ising_score(theta, Y)
# Numeric gradient
# Mean log-likelihood of the rows of `Y` under the Ising model
# `P(Y = y) = p_0 exp(vech(y y')' theta)`, evaluated by brute force.
#
# @param theta parameter vector of length `p (p + 1) / 2`.
# @param Y logical matrix with `p` columns, one observation per row.
# @return the log-likelihood averaged over the rows of `Y`.
log.likelihood <- function(theta, Y) {
    p <- ncol(Y)
    # one parameter per lower triangular element (incl. diagonal)
    stopifnot(p * (p + 1) == 2 * length(theta))

    # selects the `vech(y y')` rows out of the `vec(y y')` rows
    vechRows <- lower.tri(diag(p), diag = TRUE)
    # columns are `vech(y y')` for every row `y` of `events`
    interactions <- function(events) {
        apply(events, 1, function(y) outer(y, y, `&`))[vechRows, ]
    }

    # Internally the left columns are the high bits (high index) and the right
    # columns the low bits (low index); reverse the column order such that the
    # indices match up
    Y <- Y[, rev(seq_len(p)), drop = FALSE]

    # normalizing constant `1 / p_0` as sum over all `2^p` events
    normalizer <- sum(exp(theta %*% interactions(allY(p))))

    # average exponent of the observations minus the log normalizing constant
    mean(theta %*% interactions(Y)) - log(normalizer)
}

# Central finite difference approximation of the gradient of `log.likelihood`
# with respect to every entry of `theta`
G.num <- local({
    step <- 1e-6
    # difference quotient along the `k`-th coordinate direction
    partial <- function(k) {
        shift <- replace(numeric(length(theta)), k, step)
        forward <- log.likelihood(theta + shift, Y)
        backward <- log.likelihood(theta - shift, Y)
        (forward - backward) / (2 * step)
    }
    mapply(partial, seq_along(theta))
})

# side by side comparison of `G` and its numeric approximation
data.frame(G, G.num)


# Print `sum_{k = 0}^{p} choose(n, k)` next to the value of `nrSubSets(n, p)`
# NOTE: the loop variables overwrite the global `n` and `p`, after the loop
#       they are left at `n = 14` and `p = 4`
for (n in c(2, 7, 12, 13, 14)) {
    for (p in 1:4) {
        cat(sprintf("%6d / %6d\n", sum(mapply(choose, n, 0:p)), nrSubSets(n, p)))
    }
}


# NOTE(review): `theta` was drawn above for 6 variables (21 entries) while `p`
#       is set to 5 here, so `A` (15 x 15) is built for a different dimension
#       than the `theta` handed to `ising_fisher_info` -- confirm the intent
p <- 5
# `A` is the sum of `vech(y y') vech(y y')'` over all events `y` (printed)
(A <- tcrossprod(apply(allY(p), 1, function(y) outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)])))

print.table(B <- ising_fisher_info(theta), zero.print = ".")

# compare the lower triangular parts (incl. diagonal) of `A` and `B`
all.equal(A[lower.tri(A, TRUE)], B[lower.tri(B, TRUE)])

# Plain R reference computation of `Cov(vech(Y Y'))` under the Ising model
# `P(Y = y) = p_0 exp(vech(y y')' theta)` by enumerating all `2^p` events.
#
# @param theta parameter vector of length `p (p + 1) / 2`.
# @param p number of binary variables.
# @return square matrix of order `p (p + 1) / 2`.
ising_fisher_info.R <- function(theta, p) {
    stopifnot(2 * length(theta) == p * (p + 1))

    events <- allY(p)
    ltri <- lower.tri(diag(p), diag = TRUE)
    # `vech(y y')` of a single event `y`
    vech <- function(y) outer(y, y, `&`)[ltri]
    # unnormalized probability `exp(vech(y y')' theta)` given `v = vech(y y')`
    weight <- function(v) exp(sum(v * theta))

    # normalizing constant, that is `p_0 = 1 / sum_y exp(vech(y y')' theta)`
    normalizer <- sum(apply(events, 1, function(y) weight(vech(y))))
    p_0 <- 1 / normalizer

    # first moment `E[vech(Y Y')]`
    first <- p_0 * rowSums(apply(events, 1, function(y) {
        v <- vech(y)
        weight(v) * v
    }))

    # second moment `E[vech(Y Y') vech(Y Y')']`
    second <- p_0 * matrix(rowSums(apply(events, 1, function(y) {
        v <- vech(y)
        weight(v) * outer(v, v)
    })), p * (p + 1) / 2)

    # covariance as second moment minus outer product of the first moment
    second - outer(first, first)
}

# Compare the plain R reference implementation with `ising_fisher_info`
# NOTE(review): `ising_fisher_info.R` stops unless `theta` has `p (p + 1) / 2`
#       entries, ensure `theta` matches the current value of `p` at this point
all.equal(
    ising_fisher_info.R(theta, p),
    ising_fisher_info(theta)
)

# Runtime comparison of both implementations with new parameters for `p = 10`
p <- 10
theta <- rnorm(p * (p + 1) / 2)
microbenchmark::microbenchmark(
    ising_fisher_info.R(theta, p),
    ising_fisher_info(theta)
)


# Estimate the Ising model parameters by a fixed number of Fisher scoring steps.
#
# Prints the initial estimate and traces the log-likelihood after every step.
#
# @param Y logical matrix, one observation per row.
# @return the parameter vector after 20 Fisher scoring iterations.
ising_fisher_scoring <- function(Y) {
    # the dimension is determined by the data (not by a global `p`), otherwise
    # the lower triangular index does not match the `ncol(Y) x ncol(Y)`
    # products `y y'` computed below
    p <- ncol(Y)

    # initial estimate (guess) from the observed means of `vech(y y')`
    ltri <- which(lower.tri(diag(p), diag = TRUE))
    theta <- ising_theta_from_cond_prob(rowMeans(apply(Y, 1, function(y) outer(y, y, `&`)[ltri])))

    print(theta)

    # log-likelihood trace, starting with the initial estimate
    ll <- log.likelihood(theta, Y)

    # iterate Fisher scoring: `theta <- theta + I(theta)^-1 score(theta)`
    for (iter in seq_len(20)) {
        theta <- theta + solve(ising_fisher_info(theta), ising_score(theta, Y))

        ll <- c(ll, log.likelihood(theta, Y))

        cat("ll: ", tail(ll, 1), "\n")
    }

    theta
}
# Run the Fisher scoring on the current sample `Y`
ising_fisher_scoring(Y)


# Runtime comparison of three ways to compute the covariance of `Y`
microbenchmark::microbenchmark(
    cov.mvbinary(Y),        # double copy (TODO: change MVBinary conversion/SEXP binding)
    cov(Y),                 # call the next expr. through default args
    # direct call of the internal C routine of the `stats` package
    .Call(stats:::C_cov, Y, NULL, na.method = 4L, FALSE)
)


################################################################################
###                         Conditional Ising Model                          ###
################################################################################
# `n` observations, `p` predictors and `q` binary response variables
n <- 1000
p <- 10
q <- 10

# random `p x q` parameter matrix and `n x p` standard normal predictors
alpha <- matrix(rnorm(p * q), p)
X <- matrix(rnorm(n * p), n)
# Parameter vector of the conditional model for a single predictor `x`.
#
# Computes `Theta = alpha' x x' alpha` and returns its lower triangle (incl.
# diagonal, column-major) with the off diagonal elements doubled.
#
# @param alpha parameter matrix with one row per predictor.
# @param x predictor vector of length `nrow(alpha)`.
# @return numeric vector of length `ncol(alpha) (ncol(alpha) + 1) / 2`.
theta <- function(alpha, x) {
    # `x' alpha` as row vector
    projection <- crossprod(x, alpha)
    Theta <- crossprod(projection)
    # halve the diagonal to undo the doubling applied to all elements below
    diag(Theta) <- 0.5 * diag(Theta)
    keep <- lower.tri(diag(ncol(alpha)), diag = TRUE)
    2 * Theta[keep]
}
# sample Y ~ P( . | X = x) for x in X (one draw per row of `X`, timed)
system.time(Y <- apply(X, 1, function(x) ising_sample(1, theta(alpha, x))))
# attach the number of binary variables and mark the sample as "mvbinary"
attr(Y, "p") <- as.integer(q)
class(Y) <- "mvbinary"

# For the numeric gradient comparison
# Enumerate all `2^p` binary events of `p` variables.
#
# @param p number of binary variables, a single number `>= 1`.
# @return logical `2^p x p` matrix (no dimnames) whose rows count upwards in
#   binary with the first column as the highest bit.
allY <- function(p) {
    stopifnot(is.numeric(p), length(p) == 1, p >= 1)
    # start with a one column matrix such that `p = 1` also gives a matrix
    events <- matrix(c(FALSE, TRUE), ncol = 1)
    # every pass prepends one more (higher) bit, doubling the number of rows
    for (. in seq_len(p - 1)) {
        events <- rbind(
            cbind(FALSE, events),
            cbind(TRUE, events)
        )
    }
    events
}
# Plain R reference computation of the mean conditional log-likelihood.
#
# For every observation `i` the parameters are
# `theta_i = vech((2 1_q 1_q' - I_q) o (alpha' x_i x_i' alpha))` and the
# normalizing constant is computed by enumerating all `2^q` events.
#
# @param alpha `p x q` parameter matrix.
# @param X `n x p` predictor matrix.
# @param Y responses, converted with `as.mvbmatrix` to a matrix with `n` rows.
# @return the log-likelihood averaged over the `n` observations.
ising_conditional_log_likelihood.R <- function(alpha, X, Y) {
    # convert Y to a binary matrix
    Y <- as.mvbmatrix(Y)
    # retrieve dimensions
    n <- nrow(X)
    p <- ncol(X)
    q <- ncol(Y)
    # check dimensions; the conditions are passed as separate arguments since
    # a single braced block would only test the value of its last expression
    stopifnot(
        nrow(Y) == n,
        all(dim(alpha) == c(p, q))
    )

    # setup reused internal variables
    vech_index <- which(lower.tri(diag(q), diag = TRUE))
    # columns are `vech(y y')` for all events `y`
    aaY <- apply(allY(q), 1, function(y) outer(y, y, `&`))[vech_index, ]

    # sum over all observations
    ll <- 0
    for (i in seq_len(n)) {
        # Theta = alpha' x x' alpha
        Theta <- crossprod(crossprod(X[i, ], alpha))
        # theta = vech((2 1_q 1_q' - I_q) o Theta)
        theta <- ((2 - diag(q)) * Theta)[vech_index]

        # scaling factor `p_0^-1 = sum_y exp(vech(y y')' theta)`
        sum_0 <- sum(exp(theta %*% aaY))

        # evaluate log likelihood
        ll <- ll + sum(theta * outer(Y[i, ], Y[i, ], `&`)[vech_index]) - log(sum_0)
    }

    ll / n
}
# numeric gradient (score of the log-likelihood)
# Central finite difference approximation of the derivative of
# `ising_conditional_log_likelihood.R` with respect to every entry of `alpha`.
#
# @param alpha parameter matrix at which to differentiate.
# @param X,Y passed on to `ising_conditional_log_likelihood.R`.
# @param h step size of the difference quotient.
# @return matrix of the same shape as `alpha`.
ising_conditional_score.R <- function(alpha, X, Y, h = 1e-6) {
    # difference quotient with respect to the `k`-th entry of `alpha`
    partial <- function(k) {
        shift <- replace(numeric(length(alpha)), k, h)
        forward <- ising_conditional_log_likelihood.R(alpha + shift, X, Y)
        backward <- ising_conditional_log_likelihood.R(alpha - shift, X, Y)
        (forward - backward) / (2 * h)
    }
    gradient <- mapply(partial, seq_along(alpha))
    # arrange the partial derivatives in the shape of `alpha`
    matrix(gradient, nrow(alpha))
}

# The R reference log-likelihood has to agree with the package implementation
stopifnot(all.equal(
    ising_conditional_log_likelihood.R(alpha, X, Y),
    ising_conditional_log_likelihood(alpha, X, Y)
))
# runtime comparison of the two log-likelihood implementations
microbenchmark::microbenchmark(
    ising_conditional_log_likelihood.R(alpha, X, Y),
    ising_conditional_log_likelihood(alpha, X, Y)
)

# The numeric gradient has to agree with the package score implementation
stopifnot(all.equal(
    ising_conditional_score.R(alpha, X, Y),
    ising_conditional_score(alpha, X, Y)
))
# runtime comparison of the two score implementations
microbenchmark::microbenchmark(
    ising_conditional_score.R(alpha, X, Y),
    ising_conditional_score(alpha, X, Y)
)

################################################################################
###                       Fit Conditional Ising Model                        ###
################################################################################

# Fit the parameter matrix `alpha` of the conditional Ising model.
#
# An initial value is constructed from the SVD of the predictor covariance and
# the SVD of a parameter matrix estimated from the one and two way means of
# `Y`. Then the negative conditional log-likelihood is minimized with
# `tensorPredictors::NAGD`.
#
# @param X predictor matrix with one observation per row.
# @param Y responses; either an object with rows and columns or a vector
#   carrying the number of binary variables in its "p" attribute.
# @param ... passed on to `tensorPredictors::NAGD`.
# @param callback passed on to `tensorPredictors::NAGD`.
# @return the value returned by `tensorPredictors::NAGD`.
ising_conditional_fit <- function(X, Y, ..., callback = NULL) {
    # get and check dimensions
    n <- if (is.null(nrow(Y))) length(Y) else nrow(Y)
    p <- ncol(X)
    q <- if (is.null(ncol(Y))) attr(Y, "p") else ncol(Y)
    # check dimensions
    stopifnot(nrow(X) == n)

    # number of singular vectors used for the initial value
    r <- min(p, q)

    ### Initial value estimate
    # SVD of the predictor covariance estimate `Sigma = U_Sigma D_Sigma U_Sigma'`
    SigmaSVD <- La.svd(cov(X), r, 0)

    # Estimate the conditional probabilities as the single and two way effect
    # means (approx conditional probabilities through the marginal probability
    # estimate)
    probs <- mean.mvbinary(Y, twoway = TRUE)

    # convert conditional probabilities into natural parameters (log-odds)
    theta <- ising_theta_from_cond_prob(probs)

    # convert natural parameters `theta` to square matrix form `Theta`
    Theta <- matrix(NA, q, q)
    Theta[lower.tri(diag(q), diag = TRUE)] <- theta
    Theta[upper.tri(diag(q))] <- t(Theta)[upper.tri(diag(q))]
    # halve the off diagonal elements, the diagonal is kept as is
    Theta <- (0.5 + diag(0.5, q, q)) * Theta

    # SVD of `Theta`
    ThetaSVD <- La.svd(Theta, r, 0)

    # Finally, initial `alpha` parameter estimate
    #   `alpha_0 = U_Sigma D_Sigma^-1/2 D_Theta^1/2 U_Theta'`
    # `nrow` is given explicitly because `diag(v)` for a `v` of length one
    # creates an identity matrix of order `v` instead of a 1 x 1 matrix
    scaling <- sqrt(ThetaSVD$d[seq_len(r)] / SigmaSVD$d[seq_len(r)])
    alpha <- SigmaSVD$u %*% diag(scaling, nrow = r) %*% t(ThetaSVD$u)

    ### Optimize log-likelihood for `alpha`
    tensorPredictors::NAGD(
        fun.loss = function(alpha) -ising_conditional_log_likelihood(alpha, X, Y),
        fun.grad = function(alpha) -ising_conditional_score(alpha, X, Y),
        params = alpha,
        ...,
        callback = callback
    )
}

# `n` observations, `p` predictors and `q` binary response variables
n <- 1000
p <- 7
q <- 9

# true `p x q` parameter matrix and `n x p` standard normal predictors
alpha.true <- matrix(rnorm(p * q), p)
X <- matrix(rnorm(n * p), n)
# Parameter vector of the conditional model for a single predictor `x`.
#
# Computes `Theta = alpha' x x' alpha` and returns its lower triangle (incl.
# diagonal, column-major) with the off diagonal elements doubled.
#
# @param alpha parameter matrix with one row per predictor.
# @param x predictor vector of length `nrow(alpha)`.
# @return numeric vector of length `ncol(alpha) (ncol(alpha) + 1) / 2`.
theta <- function(alpha, x) {
    # `x' alpha` as row vector
    projection <- crossprod(x, alpha)
    Theta <- crossprod(projection)
    # halve the diagonal to undo the doubling applied to all elements below
    diag(Theta) <- 0.5 * diag(Theta)
    keep <- lower.tri(diag(ncol(alpha)), diag = TRUE)
    2 * Theta[keep]
}
# sample Y ~ P( . | X = x) for x in X (one draw per row of `X`)
Y <- apply(X, 1, function(x) ising_sample(1, theta(alpha.true, x)))
# attach the number of binary variables
# NOTE(review): unlike in the previous section the class "mvbinary" is not set
#       here -- confirm that the functions called below do not rely on it
attr(Y, "p") <- as.integer(q)

# Fit the model and trace the progress; the value of the call is not stored
max.iter <- 100L
ising_conditional_fit(X, Y, max.iter = max.iter, callback = function(iter, alpha) {
    cat(sprintf(
        "%4d/%4d - diff: %12.4f - ll: %12.4f\n",
        iter, max.iter,
        # distance to the truth up to the sign, `theta(alpha, x)` is the same
        # for `alpha` and `-alpha`
        min(norm(alpha - alpha.true, "F"), norm(alpha + alpha.true, "F")),
        ising_conditional_log_likelihood(alpha, X, Y)
    ))
})


# log-likelihood at the true parameters, package and R reference implementation
ising_conditional_log_likelihood(alpha.true, X, Y)
ising_conditional_log_likelihood.R(alpha.true, X, Y)

# for comparison, the log-likelihood at 10 random parameter matrices
for (. in 1:10) {
    print(ising_conditional_log_likelihood(matrix(rnorm(p * q), p, q), X, Y))
}

# Runtime comparison of `mean.mvbinary` against the means of `vech(y y')`
# computed in plain R from the matrix version of `Y`
YY <- as.mvbmatrix(Y)
microbenchmark::microbenchmark(
    mean.mvbinary(Y, twoway = TRUE),
    rowMeans(apply(YY, 1, function(y) outer(y, y, `&`)))[lower.tri(diag(q), diag = TRUE)]
)

# 2 x 2 grid of plots comparing `alpha` with `alpha.true` and `-alpha.true`
# NOTE(review): the value of the `ising_conditional_fit` call above is not
#       assigned, so `alpha` is whatever is bound to that name in the global
#       environment and not the fitted estimate -- confirm what should be
#       plotted here
par(mfrow = c(2, 2))
tensorPredictors::matrixImage(alpha)
tensorPredictors::matrixImage(alpha.true)
tensorPredictors::matrixImage(alpha)
tensorPredictors::matrixImage(-alpha.true)