# tensor_predictors/mvbernoulli/inst/examples/ising_grad.R

library(mvbernoulli)

#' Print a multivariate binary data set as a compact bit table.
#'
#' Every row of `Y` is one observation (event). Rows are labelled
#' "<row index>/<event number>" where the event number is the integer whose
#' binary representation is the row (first column = highest bit). Columns are
#' labelled with their bit index and zeros are printed as ".".
#'
#' @param Y logical (or 0/1) matrix with one observation per row.
#' @return the value of `print.table()` for the labelled integer matrix.
printMVBinary <- function(Y) {
    # the number of bits is a property of the data; do not rely on a global `p`
    p <- ncol(Y)
    Y <- array(as.integer(Y), dim = dim(Y))
    # bit index of each column, the left most column is the highest bit
    bitIndex <- rev(seq_len(p)) - 1
    eventIndex <- seq_len(nrow(Y))
    # event number = sum_j y_j 2^(bit index of column j)
    eventNr <- drop(Y %*% 2^bitIndex)
    dimnames(Y) <- list(
        "Index/Event" = paste(eventIndex, eventNr, sep = "/"),
        "Bit Index" = as.character(bitIndex)
    )
    print.table(Y, zero.print = ".")
}


# sample size and number of binary variables of the first experiments
n <- 100
p <- 6

# random parameter vector with one entry per element of the lower triangle
# (incl. diagonal) of a p x p matrix; the parentheses print the value
(theta <- rnorm(p * (p + 1) / 2))

# round trip check: `ising_theta_from_cond_prob` applied to the result of
# `ising_cond_probs` is compared against the original `theta`
# NOTE: this assignment masks the base constant `pi` for the rest of the script
pi <- ising_cond_probs(theta)
all.equal(
    theta,
    ising_theta_from_cond_prob(pi)
)

# Each of the three plots below fills the lower triangle (incl. diagonal) of a
# p x p matrix with a vector of length p (p + 1) / 2 and mirrors it into the
# upper triangle before handing the symmetric matrix to `matrixImage`.
tensorPredictors::matrixImage({
    Theta <- matrix(NA, p, p)
    Theta[lower.tri(Theta, diag = TRUE)] <- theta
    Theta[upper.tri(Theta)] <- t(Theta)[upper.tri(Theta)]
    Theta
}, main = expression(paste("natural Params ", Theta)))
tensorPredictors::matrixImage({
    PI <- matrix(NA, p, p)
    PI[lower.tri(PI, diag = TRUE)] <- ising_cond_probs(theta)
    PI[upper.tri(PI)] <- t(PI)[upper.tri(PI)]
    PI
}, main = expression(paste("Conditional Probs. P(", Y[i], " = ", Y[j], " = 1", " | ", Y[-i - j], " = ", 0, ")")))
tensorPredictors::matrixImage({
    MAR <- matrix(NA, p, p)
    MAR[lower.tri(MAR, diag = TRUE)] <- ising_marginal_probs(theta)
    MAR[upper.tri(MAR)] <- t(MAR)[upper.tri(MAR)]
    MAR
}, main = expression(paste("Marginal Probs. P(", Y[i], " = ", Y[j], " = 1)")))

# n x p logical matrix of fair coin flips (this sample does not depend on
# `theta`), printed as bit table
Y <- matrix(sample(c(TRUE, FALSE), n * p, replace = TRUE), n)
printMVBinary(Y)


#' Enumerate all binary vectors of length `p`.
#'
#' Row `k + 1` (for `k = 0, ..., 2^p - 1`) holds the binary representation of
#' `k` with the highest bit in the first column, i.e. the rows count upwards:
#' `(F, ..., F, F)`, `(F, ..., F, T)`, `(F, ..., T, F)`, ...
#'
#' @param p number of binary variables (bits).
#' @return logical matrix of dimension `2^p x p`; always a matrix, also for
#'  `p = 1` such that `apply(allY(p), 1, ...)` works for every `p >= 1`.
allY <- function(p) {
    eventNr <- seq_len(2^p) - 1
    # bit index of each column, the left most column is the highest bit
    bitIndex <- rev(seq_len(p)) - 1
    # entry (k, j) tests if bit `bitIndex[j]` is set in event number `k`
    outer(eventNr, bitIndex, function(k, bit) (k %/% 2^bit) %% 2 == 1)
}
# print the table of all events enumerated by `allY`
printMVBinary(allY(p))

# score as computed by the package; it is put side by side with a numeric
# gradient of `log.likelihood` further below
G <- ising_score(theta, Y)
# Numeric gradient (reference log-likelihood used for the finite differences)
#
# Evaluates `mean_i vech(y_i y_i')' theta - log(sum_y exp(vech(y y')' theta))`
# where the sum runs over all events enumerated by `allY`.
log.likelihood <- function(theta, Y) {
    p <- ncol(Y)
    # `theta` needs one entry per element of the lower triangle (incl. diag.)
    stopifnot(p * (p + 1) == 2 * length(theta))

    # Internally the left columns are the high bits (high index) and the right
    # columns the low bits (low index). To match the parameter indices the
    # column order of the observations is reversed.
    Y <- Y[, rev(seq_len(p)), drop = FALSE]

    # log of the scaling factor, summing over all events
    logSum0 <- log(sum(exp(
        theta %*% apply(allY(p), 1, function(y) outer(y, y, `&`))[lower.tri(diag(p), diag = TRUE), ]
    )))

    # average of the linear term over the observations
    linear <- mean(
        theta %*% apply(Y, 1, function(y) outer(y, y, `&`))[lower.tri(diag(p), diag = TRUE), ]
    )

    linear - logSum0
}

# Central finite difference approximation of the gradient of `log.likelihood`
# with respect to every component of `theta`
G.num <- local({
    h <- 1e-6
    index <- seq_along(theta)
    mapply(function(k) {
        # step of size `h` along the k-th coordinate only
        step <- h * (index == k)
        forward <- log.likelihood(theta + step, Y)
        backward <- log.likelihood(theta - step, Y)
        (forward - backward) / (2 * h)
    }, index)
})

# package score next to the numeric gradient
data.frame(G, G.num)


# Print `sum_{k = 0}^{p} choose(n, k)` next to the package's `nrSubSets(n, p)`.
# NOTE: the loop variables are the globals `n` and `p`, they keep the value of
# the last iteration afterwards.
for (n in c(2, 7, 12, 13, 14)) {
    for (p in 1:4) {
        cat(sprintf(
            "%6d / %6d\n",
            sum(choose(n, 0:p)),
            nrSubSets(n, p)
        ))
    }
}


p <- 5
# `theta` was drawn for p = 6 at the top of the script. Redraw it such that its
# length p (p + 1) / 2 matches the current `p`; otherwise the matrices compared
# below have different sizes and the dimension check of `ising_fisher_info.R`
# further down stops.
theta <- rnorm(p * (p + 1) / 2)

# unweighted sum of `vech(y y') vech(y y')'` over all events (printed)
(A <- tcrossprod(apply(allY(p), 1, function(y) outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)])))

# Fisher information as computed by the package
print.table(B <- ising_fisher_info(theta), zero.print = ".")

# compare the lower triangles (incl. diagonal) of both matrices
all.equal(A[lower.tri(A, TRUE)], B[lower.tri(B, TRUE)])

#' Plain R reference implementation of the Ising model Fisher information.
#'
#' Computes `Cov(vech(Y Y'))` under `P(Y = y) = p_0 exp(vech(y y')' theta)` by
#' enumerating all events via `allY(p)`.
#'
#' @param theta numeric vector of length `p (p + 1) / 2`.
#' @param p number of binary variables.
#' @return square numeric matrix with `p (p + 1) / 2` rows and columns.
ising_fisher_info.R <- function(theta, p) {
    stopifnot(2 * length(theta) == p * (p + 1))

    vech_index <- which(lower.tri(diag(p), diag = TRUE))

    # `vech(y y')` for every event, one event per column (computed only once)
    vechYY <- apply(allY(p), 1, function(y) outer(y, y, `&`)[vech_index])
    # enforce a numeric matrix, `apply` simplifies to a vector for `p = 1`
    vechYY <- matrix(as.numeric(vechYY), nrow = length(vech_index))

    # Event probabilities `P(Y = y) = p_0 exp(vech(y y')' theta)`. Subtracting
    # the maximum before exponentiating leaves the probabilities unchanged but
    # avoids `Inf / Inf` for large entries of `theta`.
    eta <- drop(theta %*% vechYY)
    weights <- exp(eta - max(eta))
    prob <- weights / sum(weights)

    # E[vech(Y Y')]
    EvechYY <- drop(vechYY %*% prob)

    # E[vech(Y Y') vech(Y Y')'] (column `j` of `vechYY` is scaled by `prob[j]`)
    EvechYYvechYY <- tcrossprod(vechYY * rep(prob, each = nrow(vechYY)), vechYY)

    # Cov(vech(Y Y'), vech(Y Y')) = E[vech(Y Y') vech(Y Y')'] - E[vech(Y Y')] E[vech(Y Y')]'
    EvechYYvechYY - outer(EvechYY, EvechYY)
}

# compare the plain R reference against the package implementation
# NOTE(review): `ising_fisher_info.R` stops unless `length(theta)` equals
# `p * (p + 1) / 2` for the current value of `p`
all.equal(
    ising_fisher_info.R(theta, p),
    ising_fisher_info(theta)
)

# runtime comparison for a larger model (the R version enumerates all 2^p
# events via `allY`)
p <- 10
theta <- rnorm(p * (p + 1) / 2)
microbenchmark::microbenchmark(
    ising_fisher_info.R(theta, p),
    ising_fisher_info(theta)
)


#' Fisher scoring iterations for the parameters of an Ising model.
#'
#' Starts at `ising_theta_from_cond_prob` applied to the empirical means of
#' `vech(y y')` and performs 20 updates
#' `theta <- theta + I(theta)^-1 score(theta)`. The initial estimate and the
#' log-likelihood after every update are printed.
#'
#' @param Y logical (or 0/1) matrix with one observation per row.
#' @return the parameter vector after the last iteration.
ising_fisher_scoring <- function(Y) {
    # the dimension is taken from the data; a global `p` may have been changed
    # since `Y` was sampled
    p <- ncol(Y)

    # initial estimate (guess)
    ltri <- which(lower.tri(diag(p), diag = TRUE))
    theta <- ising_theta_from_cond_prob(rowMeans(apply(Y, 1, function(y) outer(y, y, `&`)[ltri])))

    print(theta)

    # log-likelihood trace (initial value plus one entry per iteration)
    max.iter <- 20L
    ll <- numeric(max.iter + 1L)
    ll[1L] <- log.likelihood(theta, Y)

    # iterate Fisher scoring
    for (iter in seq_len(max.iter)) {
        theta <- theta + solve(ising_fisher_info(theta), ising_score(theta, Y))

        ll[iter + 1L] <- log.likelihood(theta, Y)

        cat("ll: ", ll[iter + 1L], "\n")
    }

    theta
}
# run the Fisher scoring on the sample from the top of the script
# NOTE(review): `Y` still is the matrix with 6 columns sampled above while the
# global `p` has been set to 10 in the meantime
ising_fisher_scoring(Y)


# runtime of the package covariance against `stats::cov` and a direct call of
# the C routine behind it
# NOTE(review): `stats:::C_cov` is an unexported internal of `stats`; the
# argument list is presumably tied to the installed R version -- TODO confirm
microbenchmark::microbenchmark(
    cov.mvbinary(Y),        # double copy (TODO: change MVBinary conversion/SEXP binding)
    cov(Y),                 # call the next expr. through default args
    .Call(stats:::C_cov, Y, NULL, na.method = 4L, FALSE)
)


################################################################################
###                         Conditional Ising Model                          ###
################################################################################
# dimensions: `n` observations, `p` predictors (columns of `X`) and `q` binary
# response variables (columns of `alpha`)
n <- 1000
p <- 10
q <- 10

# random p x q parameter matrix and n x p predictor matrix
alpha <- matrix(rnorm(p * q), p)
X <- matrix(rnorm(n * p), n)
# Parameter vector of the conditional model for a single predictor `x`: the
# lower triangle (incl. diagonal) of `Theta = alpha' x x' alpha` with doubled
# off-diagonal and unchanged diagonal entries.
theta <- function(alpha, x) {
    # Theta = alpha' x x' alpha
    Theta <- crossprod(crossprod(x, alpha))
    # factor 1 on the diagonal and 2 everywhere else
    scaling <- 2 - diag(ncol(alpha))
    (scaling * Theta)[lower.tri(scaling, diag = TRUE)]
}
# sample Y ~ P( . | X = x) for x in X
# (one draw per row of `X`; `system.time` reports the time the sampling takes)
system.time(Y <- apply(X, 1, function(x) ising_sample(1, theta(alpha, x))))
# tag the sample with the number of binary variables and the class "mvbinary"
# (presumably the representation the `*.mvbinary` functions of the package
# operate on -- TODO confirm)
attr(Y, "p") <- as.integer(q)
class(Y) <- "mvbinary"

# For the numeric gradient comparison
#
# Enumerates all binary vectors of length `p` as rows of a `2^p x p` logical
# matrix. Row `k + 1` is the binary representation of `k = 0, ..., 2^p - 1` with
# the highest bit in the first column. The result is a matrix for every
# `p >= 1` (also for `p = 1`) such that `apply(allY(p), 1, ...)` always works.
allY <- function(p) {
    eventNr <- seq_len(2^p) - 1
    # bit index of each column, the left most column is the highest bit
    bitIndex <- rev(seq_len(p)) - 1
    # entry (k, j) tests if bit `bitIndex[j]` is set in event number `k`
    outer(eventNr, bitIndex, function(k, bit) (k %/% 2^bit) %% 2 == 1)
}
#' Plain R reference implementation of the conditional Ising log-likelihood.
#'
#' For every observation `i` the parameters are
#' `theta_i = vech((2 1_q 1_q' - I_q) o alpha' x_i x_i' alpha)` and the
#' log-likelihood contribution is
#' `vech(y_i y_i')' theta_i - log(sum_y exp(vech(y y')' theta_i))`, with the sum
#' running over all events enumerated by `allY(q)`.
#'
#' @param alpha numeric `p x q` parameter matrix.
#' @param X numeric `n x p` predictor matrix.
#' @param Y binary responses, anything `as.mvbmatrix` converts to a matrix with
#'  `n` rows.
#' @return the average log-likelihood over the `n` observations.
ising_conditional_log_likelihood.R <- function(alpha, X, Y) {
    # convert Y to a binary matrix
    Y <- as.mvbmatrix(Y)
    # retrieve dimensions
    n <- nrow(X)
    p <- ncol(X)
    q <- ncol(Y)
    # check dimensions (every condition is a separate argument, a braced block
    # would only test its last expression)
    stopifnot(
        nrow(Y) == n,
        all(dim(alpha) == c(p, q))
    )

    # setup reused internal variables
    vech_index <- which(lower.tri(diag(q), diag = TRUE))
    aaY <- apply(allY(q), 1, function(y) outer(y, y, `&`))[vech_index, ]

    # sum over all observations
    ll <- 0
    for (i in seq_len(n)) {
        # Theta = alpha' x x' alpha
        Theta <- crossprod(crossprod(X[i, ], alpha))
        # theta = vech((2 1_q 1_q' - I_q) o Theta)
        theta <- ((2 - diag(q)) * Theta)[vech_index]

        # log scaling factor `log(p_0^-1) = log(sum_y exp(vech(y y')' theta))`
        # evaluated as log-sum-exp; `exp` of the raw values overflows quickly
        eta <- drop(theta %*% aaY)
        eta_max <- max(eta)
        log_sum_0 <- eta_max + log(sum(exp(eta - eta_max)))

        # evaluate log likelihood
        ll <- ll + sum(theta * outer(Y[i, ], Y[i, ], `&`)[vech_index]) - log_sum_0
    }

    ll / n
}
# Numeric gradient (score) of the conditional log-likelihood: central finite
# differences of `ising_conditional_log_likelihood.R` with step size `h` for
# every entry of `alpha`, returned as a matrix with the row count of `alpha`.
ising_conditional_score.R <- function(alpha, X, Y, h = 1e-6) {
    index <- seq_along(alpha)

    # central difference quotient with respect to the k-th entry of `alpha`
    centralDiff <- function(k) {
        step <- h * (index == k)
        forward <- ising_conditional_log_likelihood.R(alpha + step, X, Y)
        backward <- ising_conditional_log_likelihood.R(alpha - step, X, Y)
        (forward - backward) / (2 * h)
    }

    matrix(mapply(centralDiff, index), nrow(alpha))
}

# the package log-likelihood has to agree with the plain R reference
stopifnot(all.equal(
    ising_conditional_log_likelihood.R(alpha, X, Y),
    ising_conditional_log_likelihood(alpha, X, Y)
))
microbenchmark::microbenchmark(
    ising_conditional_log_likelihood.R(alpha, X, Y),
    ising_conditional_log_likelihood(alpha, X, Y)
)

# the package score has to agree with the numeric gradient of the reference
# (the numeric version evaluates the reference log-likelihood twice per entry
# of `alpha`)
stopifnot(all.equal(
    ising_conditional_score.R(alpha, X, Y),
    ising_conditional_score(alpha, X, Y)
))
microbenchmark::microbenchmark(
    ising_conditional_score.R(alpha, X, Y),
    ising_conditional_score(alpha, X, Y)
)

################################################################################
###                       Fit Conditional Ising Model                        ###
################################################################################

#' Fit the conditional Ising model parameter matrix `alpha`.
#'
#' Computes an initial value for `alpha` from the SVD of the predictor
#' covariance and of an unconditional parameter estimate, then minimizes the
#' negative conditional log-likelihood with `tensorPredictors::NAGD`.
#'
#' @param X numeric `n x p` predictor matrix.
#' @param Y binary responses; either a matrix with `n` rows or an object of
#'  length `n` carrying the number of binary variables in the attribute `"p"`.
#' @param ... passed on to `tensorPredictors::NAGD`.
#' @param callback passed on to `tensorPredictors::NAGD`.
#' @return the value returned by `tensorPredictors::NAGD`.
ising_conditional_fit <- function(X, Y, ..., callback = NULL) {
    # get and check dimensions
    n <- if (is.null(nrow(Y))) length(Y) else nrow(Y)
    p <- ncol(X)
    q <- if (is.null(ncol(Y))) attr(Y, "p") else ncol(Y)
    # check dimensions (`q` is unknown if `Y` neither is a matrix nor carries
    # the attribute "p")
    stopifnot(
        nrow(X) == n,
        length(q) == 1L
    )
    # number of singular values/vectors used for the initial value
    rank <- min(p, q)

    ### Initial value estimate
    # SVD of the predictor covariance estimate `Sigma = U_Sigma D_Sigma U_Sigma'`
    SigmaSVD <- La.svd(cov(X), rank, 0)

    # Estimate the single and two way effect means (approx conditional
    # probabilities through the marginal probability estimate)
    cond_probs <- mean.mvbinary(Y, twoway = TRUE)

    # convert conditional probabilities into natural parameters (log-odds)
    theta <- ising_theta_from_cond_prob(cond_probs)

    # convert natural parameters `theta` to square matrix form `Theta`
    # (symmetric fill, then halve the off-diagonal and keep the diagonal)
    Theta <- matrix(NA, q, q)
    Theta[lower.tri(diag(q), diag = TRUE)] <- theta
    Theta[upper.tri(diag(q))] <- t(Theta)[upper.tri(diag(q))]
    Theta <- (0.5 + diag(0.5, q, q)) * Theta

    # SVD of `Theta`
    ThetaSVD <- La.svd(Theta, rank, 0)

    # Finally, initial `alpha` parameter estimate
    #   `alpha_0 = U_Sigma D_Sigma^-1/2 D_Theta^1/2 U_Theta'`
    # `nrow` is given explicitly as `diag(x)` with a single number `x` creates
    # an identity matrix instead of the 1 x 1 matrix containing `x`
    scaling <- sqrt(ThetaSVD$d[seq_len(rank)] / SigmaSVD$d[seq_len(rank)])
    alpha <- SigmaSVD$u %*% diag(scaling, nrow = rank) %*% t(ThetaSVD$u)

    ### Optimize log-likelihood for `alpha`
    tensorPredictors::NAGD(
        fun.loss = function(alpha) -ising_conditional_log_likelihood(alpha, X, Y),
        fun.grad = function(alpha) -ising_conditional_score(alpha, X, Y),
        params = alpha,
        ...,
        callback = callback
    )
}

# dimensions of the fit example: `n` observations, `p` predictors and `q`
# binary response variables
n <- 1000
p <- 7
q <- 9

# true p x q parameter matrix (the fit is compared against it below) and the
# n x p predictor matrix
alpha.true <- matrix(rnorm(p * q), p)
X <- matrix(rnorm(n * p), n)
# Parameter vector of the conditional model for a single predictor `x`: the
# lower triangle (incl. diagonal) of `Theta = alpha' x x' alpha` with doubled
# off-diagonal and unchanged diagonal entries.
theta <- function(alpha, x) {
    # Theta = alpha' x x' alpha
    Theta <- crossprod(crossprod(x, alpha))
    # factor 1 on the diagonal and 2 everywhere else
    scaling <- 2 - diag(ncol(alpha))
    (scaling * Theta)[lower.tri(scaling, diag = TRUE)]
}
# sample Y ~ P( . | X = x) for x in X
Y <- apply(X, 1, function(x) ising_sample(1, theta(alpha.true, x)))
# NOTE(review): in contrast to the sample of the previous section only the
# attribute "p" is set here, the class "mvbinary" is not -- confirm that the
# functions called below do not rely on the class
attr(Y, "p") <- as.integer(q)

# Fit and report the progress of every iteration. `Theta = alpha' x x' alpha`
# does not change if the sign of `alpha` is flipped, therefore the distance to
# the closer one of `alpha.true` and `-alpha.true` is printed.
# NOTE(review): the return value of the fit is not assigned to any variable
max.iter <- 100L
ising_conditional_fit(X, Y, max.iter = max.iter, callback = function(iter, alpha) {
    cat(sprintf(
        "%4d/%4d - diff: %12.4f - ll: %12.4f\n",
        iter, max.iter,
        min(norm(alpha - alpha.true, "F"), norm(alpha + alpha.true, "F")),
        ising_conditional_log_likelihood(alpha, X, Y)
    ))
})


# log-likelihood at the true parameters, package and plain R implementation
ising_conditional_log_likelihood(alpha.true, X, Y)
ising_conditional_log_likelihood.R(alpha.true, X, Y)

# log-likelihood at 10 random parameter matrices for comparison
for (. in 1:10) {
    print(ising_conditional_log_likelihood(matrix(rnorm(p * q), p, q), X, Y))
}

# runtime of the package mean (incl. two way effects) against a plain R version
# working on the matrix representation of `Y`
YY <- as.mvbmatrix(Y)
microbenchmark::microbenchmark(
    mean.mvbinary(Y, twoway = TRUE),
    rowMeans(apply(YY, 1, function(y) outer(y, y, `&`)))[lower.tri(diag(q), diag = TRUE)]
)

# 2 x 2 plot grid of `alpha` next to `alpha.true` and `-alpha.true`
# NOTE(review): the global `alpha` is not assigned in this section, it still is
# the 10 x 10 matrix drawn in the "Conditional Ising Model" section and not the
# result of the fit above -- presumably the fit result was meant, verify
par(mfrow = c(2, 2))
tensorPredictors::matrixImage(alpha)
tensorPredictors::matrixImage(alpha.true)
tensorPredictors::matrixImage(alpha)
tensorPredictors::matrixImage(-alpha.true)
add: mvbernoulli, wip: tensorPredictors, add: GMLM 2022-10-06 12:25:40 +00:00			`library(mvbernoulli)`

			`printMVBinary <- function(Y) {`
			`Y <- array(as.integer(Y), dim = dim(Y))`
			`eventIndex <- seq_len(nrow(Y))`
			`eventNr <- apply(Y, 1, function(y) sum(y * 2^(rev(seq_len(p)) - 1)))`
			`dimnames(Y) <- list(`
			`"Index/Event" = paste(eventIndex, eventNr, sep = "/"),`
			`"Bit Index" = as.character(rev(seq_len(p)) - 1)`
			`)`
			`print.table(Y, zero.print = ".")`
			`}`


			`n <- 100`
			`p <- 6`

			`(theta <- rnorm(p * (p + 1) / 2))`

			`pi <- ising_cond_probs(theta)`
			`all.equal(`
			`theta,`
			`ising_theta_from_cond_prob(pi)`
			`)`

			`tensorPredictors::matrixImage({`
			`Theta <- matrix(NA, p, p)`
			`Theta[lower.tri(Theta, diag = TRUE)] <- theta`
			`Theta[upper.tri(Theta)] <- t(Theta)[upper.tri(Theta)]`
			`Theta`
			`}, main = expression(paste("natural Params ", Theta)))`
			`tensorPredictors::matrixImage({`
			`PI <- matrix(NA, p, p)`
			`PI[lower.tri(PI, diag = TRUE)] <- ising_cond_probs(theta)`
			`PI[upper.tri(PI)] <- t(PI)[upper.tri(PI)]`
			`PI`
			`}, main = expression(paste("Conditional Probs. P(", Y[i], " = ", Y[j], " = 1", " \| ", Y[-i - j], " = ", 0, ")")))`
			`tensorPredictors::matrixImage({`
			`MAR <- matrix(NA, p, p)`
			`MAR[lower.tri(MAR, diag = TRUE)] <- ising_marginal_probs(theta)`
			`MAR[upper.tri(MAR)] <- t(MAR)[upper.tri(MAR)]`
			`MAR`
			`}, main = expression(paste("Marginal Probs. P(", Y[i], " = ", Y[j], " = 1)")))`

			`Y <- matrix(sample(c(TRUE, FALSE), n * p, replace = TRUE), n)`
			`printMVBinary(Y)`


			`allY <- function(p) {`
			`events <- c(FALSE, TRUE)`
			`for (. in seq_len(p - 1)) {`
			`events <- rbind(`
			`cbind(FALSE, events),`
			`cbind( TRUE, events)`
			`)`
			`}`
			`events`
			`}`
			`printMVBinary(allY(p))`

			`G <- ising_score(theta, Y)`
			`# Numeric gradiend`
			`log.likelihood <- function(theta, Y) {`
			`p <- ncol(Y)`
			`# check sizes`
			`stopifnot(p * (p + 1) == 2 * length(theta))`
			`# and reverse column order`
			`# this is needed as internally the left are the high bits (high index) and`
			`# the right are the low bits (low index) which means for matching indices`
			`# we need to reverse the column order`
			`Y <- Y[, rev(seq_len(p)), drop = FALSE]`
			`# calc scaling factor`
			`sum_0 <- sum(exp(`
			theta %*% apply(allY(p), 1, function(y) outer(y, y, `&`))[lower.tri(diag(p), diag = TRUE), ]
			`))`
			`# evaluate log likelihood`
			`-log(sum_0) + mean(`
			theta %*% apply(Y, 1, function(y) outer(y, y, `&`))[lower.tri(diag(p), diag = TRUE), ]
			`)`
			`}`

			`G.num <- local({`
			`h <- 1e-6`
			`mapply(function(i) {`
			`delta <- h * (seq_along(theta) == i)`
			`(log.likelihood(theta + delta, Y) - log.likelihood(theta - delta, Y)) / (2 * h)`
			`}, seq_along(theta))`
			`})`

			`data.frame(G, G.num)`


			`for (n in c(2, 7, 12, 13, 14)) {`
			`for (p in 1:4) {`
			`cat(sprintf("%6d / %6d\n", sum(mapply(choose, n, 0:p)), nrSubSets(n, p)))`
			`}`
			`}`


			`p <- 5`
			(A <- tcrossprod(apply(allY(p), 1, function(y) outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)])))

			`print.table(B <- ising_fisher_info(theta), zero.print = ".")`

			`all.equal(A[lower.tri(A, TRUE)], B[lower.tri(B, TRUE)])`

			`ising_fisher_info.R <- function(theta, p) {`
			`stopifnot(2 * length(theta) == p * (p + 1))`

			`Y <- allY(p)`

			# Ising model scaling factor for `P(Y = y) = p_0 exp(vech(y y')' theta)`
			`sum_0 <- sum(apply(Y, 1, function(y) {`
			vechYY <- outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)]
			`exp(sum(vechYY * theta))`
			`}))`
			`p_0 <- 1 / sum_0`

			`# E[vech(Y Y')]`
			`EvechYY <- p_0 * rowSums(apply(Y, 1, function(y) {`
			vechYY <- outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)]
			`exp(sum(vechYY * theta)) * vechYY`
			`}))`

			`# E[vech(Y Y') vech(Y Y')']`
			`EvechYYvechYY <- p_0 * matrix(rowSums(apply(Y, 1, function(y) {`
			vechYY <- outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)]
			`exp(sum(vechYY * theta)) * outer(vechYY, vechYY)`
			`})), p * (p + 1) / 2)`

			`# Cov(vech(Y Y'), vech(Y Y')) = E[vech(Y Y') vech(Y Y')'] - E[vech(Y Y')] E[vech(Y Y')]'`
			`EvechYYvechYY - outer(EvechYY, EvechYY)`
			`}`

			`all.equal(`
			`ising_fisher_info.R(theta, p),`
			`ising_fisher_info(theta)`
			`)`

			`p <- 10`
			`theta <- rnorm(p * (p + 1) / 2)`
			`microbenchmark::microbenchmark(`
			`ising_fisher_info.R(theta, p),`
			`ising_fisher_info(theta)`
			`)`


			`ising_fisher_scoring <- function(Y) {`
			`# initial estimate (guess)`
			`ltri <- which(lower.tri(diag(p), diag = TRUE))`
			theta <- ising_theta_from_cond_prob(rowMeans(apply(Y, 1, function(y) outer(y, y, `&`)[ltri])))

			`print(theta)`

			`ll <- log.likelihood(theta, Y)`

			`# iterate Fisher scoring`
			`for (iter in 1:20) {`
			`theta <- theta + solve(ising_fisher_info(theta), ising_score(theta, Y))`

			`ll <- c(ll, log.likelihood(theta, Y))`

			`cat("ll: ", tail(ll, 1), "\n")`
			`}`

			`theta`
			`}`
			`ising_fisher_scoring(Y)`



			`microbenchmark::microbenchmark(`
			`cov.mvbinary(Y), # double copy (TODO: change MVBinary conversion/SEXP binding)`
			`cov(Y), # call the next expr. through default args`
			`.Call(stats:::C_cov, Y, NULL, na.method = 4L, FALSE)`
			`)`


			`################################################################################`
			`### Conditional Ising Model ###`
			`################################################################################`
			`n <- 1000`
			`p <- 10`
			`q <- 10`

			`alpha <- matrix(rnorm(p * q), p)`
			`X <- matrix(rnorm(n * p), n)`
			`theta <- function(alpha, x) {`
			`Theta <- crossprod(crossprod(x, alpha))`
			`diag(Theta) <- 0.5 * diag(Theta)`
			`2 * Theta[lower.tri(diag(ncol(alpha)), diag = TRUE)]`
			`}`
			`# sample Y ~ P( . \| X = x) for x in X`
			`system.time(Y <- apply(X, 1, function(x) ising_sample(1, theta(alpha, x))))`
			`attr(Y, "p") <- as.integer(q)`
			`class(Y) <- "mvbinary"`

			`# For the numeric gradient comparison`
			`allY <- function(p) {`
			`events <- c(FALSE, TRUE)`
			`for (. in seq_len(p - 1)) {`
			`events <- rbind(`
			`cbind(FALSE, events),`
			`cbind( TRUE, events)`
			`)`
			`}`
			`events`
			`}`
			`ising_conditional_log_likelihood.R <- function(alpha, X, Y) {`
			`# convert Y to a binary matrix`
			`Y <- as.mvbmatrix(Y)`
			`#retrieve dimensions`
			`n <- nrow(X)`
			`p <- ncol(X)`
			`q <- ncol(Y)`
			`# check dimensions`
			`stopifnot({`
			`nrow(Y) == n`
			`all(dim(alpha) == c(p, q))`
			`})`

			`# setup reused internal variables`
			`vech_index <- which(lower.tri(diag(q), diag = TRUE))`
			aaY <- apply(allY(q), 1, function(y) outer(y, y, `&`))[vech_index, ]

			`# sum over all observations`
			`ll <- 0`
			`for (i in seq_len(n)) {`
			`# Theta = alpha' x x' alpha`
			`Theta <- crossprod(crossprod(X[i, ], alpha))`
			`# theta = vech((2 1_q 1_q' - I_q) o Theta)`
			`theta <- ((2 - diag(q)) * Theta)[vech_index]`

			# scaling factor `p_0^-1 = sum_y exp(vech(y y')' theta)`
			`sum_0 <- sum(exp(theta %*% aaY))`

			`print(log(sum_0))`

			`# evaluate log likelihood`
			ll <- ll + sum(theta * outer(Y[i, ], Y[i, ], `&`)[vech_index]) - log(sum_0)
			`}`

			`ll / n`
			`}`
			`# numeric gradiend (score of the log-likelihood)`
			`ising_conditional_score.R <- function(alpha, X, Y, h = 1e-6) {`
			`matrix(mapply(function(i) {`
			`delta <- h * (seq_along(alpha) == i)`
			`(ising_conditional_log_likelihood.R(alpha + delta, X, Y) -`
			`ising_conditional_log_likelihood.R(alpha - delta, X, Y)) / (2 * h)`
			`}, seq_along(alpha)), nrow(alpha))`
			`}`

			`stopifnot(all.equal(`
			`ising_conditional_log_likelihood.R(alpha, X, Y),`
			`ising_conditional_log_likelihood(alpha, X, Y)`
			`))`
			`microbenchmark::microbenchmark(`
			`ising_conditional_log_likelihood.R(alpha, X, Y),`
			`ising_conditional_log_likelihood(alpha, X, Y)`
			`)`

			`stopifnot(all.equal(`
			`ising_conditional_score.R(alpha, X, Y),`
			`ising_conditional_score(alpha, X, Y)`
			`))`
			`microbenchmark::microbenchmark(`
			`ising_conditional_score.R(alpha, X, Y),`
			`ising_conditional_score(alpha, X, Y)`
			`)`

			`################################################################################`
			`### Fit Conditional Ising Model ###`
			`################################################################################`

			`ising_conditional_fit <- function(X, Y, ..., callback = NULL) {`
			`# get and check dimensions`
			`n <- if (is.null(nrow(Y))) length(Y) else nrow(Y)`
			`p <- ncol(X)`
			`q <- if (is.null(ncol(Y))) attr(Y, "p") else ncol(Y)`
			`# check dimensions`
			`stopifnot(nrow(X) == n)`

			`### Initial value estimate`
			# SVD of the predictor covariance estimate `Sigma = U_Sigma D_Sigma U_Sigma'`
			`SigmaSVD <- La.svd(cov(X), min(p, q), 0)`

			# Estimate `pi` as the single and two way effect means (approx conditional
			`# probabilities through the marginal probability estimate)`
			`pi <- mean.mvbinary(Y, twoway = TRUE)`

			`# convert conditional probabilities into natural parameters (log-odds)`
			`theta <- ising_theta_from_cond_prob(pi)`

			# convert natural parameters `theta` to square matrix form `Theta`
			`Theta <- matrix(NA, q, q)`
			`Theta[lower.tri(diag(q), diag = TRUE)] <- theta`
			`Theta[upper.tri(diag(q))] <- t(Theta)[upper.tri(diag(q))]`
			`Theta <- (0.5 + diag(0.5, q, q)) * Theta`

			# SVD of `Theta`
			`ThetaSVD <- La.svd(Theta, min(p, q), 0)`

			# Finally, initial `alpha` parameter estimate
			# `alpha_0 = U_Sigma D_Sigma^-1/2 D_Theta^1/2 U_Theta'`
			`alpha <- with(list(S = SigmaSVD, T = ThetaSVD), {`
			`S$u %% diag(sqrt(T$d[seq_len(min(p, q))] / S$d[seq_len(min(p, q))])) %% t(T$u)`
			`})`

			### Optimize log-likelihood for `alpha`
			`tensorPredictors::NAGD(`
			`fun.loss = function(alpha) -ising_conditional_log_likelihood(alpha, X, Y),`
			`fun.grad = function(alpha) -ising_conditional_score(alpha, X, Y),`
			`params = alpha,`
			`...,`
			`callback = callback`
			`)`
			`}`

			`n <- 1000`
			`p <- 7`
			`q <- 9`

			`alpha.true <- matrix(rnorm(p * q), p)`
			`X <- matrix(rnorm(n * p), n)`
			`theta <- function(alpha, x) {`
			`Theta <- crossprod(crossprod(x, alpha))`
			`diag(Theta) <- 0.5 * diag(Theta)`
			`2 * Theta[lower.tri(diag(ncol(alpha)), diag = TRUE)]`
			`}`
			`# sample Y ~ P( . \| X = x) for x in X`
			`Y <- apply(X, 1, function(x) ising_sample(1, theta(alpha.true, x)))`
			`attr(Y, "p") <- as.integer(q)`

			`max.iter <- 100L`
			`ising_conditional_fit(X, Y, max.iter = max.iter, callback = function(iter, alpha) {`
			`cat(sprintf(`
			`"%4d/%4d - diff: %12.4f - ll: %12.4f\n",`
			`iter, max.iter,`
			`min(norm(alpha - alpha.true, "F"), norm(alpha + alpha.true, "F")),`
			`ising_conditional_log_likelihood(alpha, X, Y)`
			`))`
			`})`


			`ising_conditional_log_likelihood(alpha.true, X, Y)`
			`ising_conditional_log_likelihood.R(alpha.true, X, Y)`

			`for (. in 1:10) {`
			`print(ising_conditional_log_likelihood(matrix(rnorm(p * q), p, q), X, Y))`
			`}`

			`YY <- as.mvbmatrix(Y)`
			`microbenchmark::microbenchmark(`
			`mean.mvbinary(Y, twoway = TRUE),`
			rowMeans(apply(YY, 1, function(y) outer(y, y, `&`)))[lower.tri(diag(q), diag = TRUE)]
			`)`

			`par(mfrow = c(2, 2))`
			`tensorPredictors::matrixImage(alpha)`
			`tensorPredictors::matrixImage(alpha.true)`
			`tensorPredictors::matrixImage(alpha)`
			`tensorPredictors::matrixImage(-alpha.true)`