library(mvbernoulli) printMVBinary <- function(Y) { Y <- array(as.integer(Y), dim = dim(Y)) eventIndex <- seq_len(nrow(Y)) eventNr <- apply(Y, 1, function(y) sum(y * 2^(rev(seq_len(p)) - 1))) dimnames(Y) <- list( "Index/Event" = paste(eventIndex, eventNr, sep = "/"), "Bit Index" = as.character(rev(seq_len(p)) - 1) ) print.table(Y, zero.print = ".") } n <- 100 p <- 6 (theta <- rnorm(p * (p + 1) / 2)) pi <- ising_cond_probs(theta) all.equal( theta, ising_theta_from_cond_prob(pi) ) tensorPredictors::matrixImage({ Theta <- matrix(NA, p, p) Theta[lower.tri(Theta, diag = TRUE)] <- theta Theta[upper.tri(Theta)] <- t(Theta)[upper.tri(Theta)] Theta }, main = expression(paste("natural Params ", Theta))) tensorPredictors::matrixImage({ PI <- matrix(NA, p, p) PI[lower.tri(PI, diag = TRUE)] <- ising_cond_probs(theta) PI[upper.tri(PI)] <- t(PI)[upper.tri(PI)] PI }, main = expression(paste("Conditional Probs. P(", Y[i], " = ", Y[j], " = 1", " | ", Y[-i - j], " = ", 0, ")"))) tensorPredictors::matrixImage({ MAR <- matrix(NA, p, p) MAR[lower.tri(MAR, diag = TRUE)] <- ising_marginal_probs(theta) MAR[upper.tri(MAR)] <- t(MAR)[upper.tri(MAR)] MAR }, main = expression(paste("Marginal Probs. P(", Y[i], " = ", Y[j], " = 1)"))) Y <- matrix(sample(c(TRUE, FALSE), n * p, replace = TRUE), n) printMVBinary(Y) allY <- function(p) { events <- c(FALSE, TRUE) for (. in seq_len(p - 1)) { events <- rbind( cbind(FALSE, events), cbind( TRUE, events) ) } events } printMVBinary(allY(p)) G <- ising_score(theta, Y) # Numeric gradiend log.likelihood <- function(theta, Y) { p <- ncol(Y) # check sizes stopifnot(p * (p + 1) == 2 * length(theta)) # and reverse column order # this is needed as internally the left are the high bits (high index) and # the right are the low bits (low index) which means for matching indices # we need to reverse the column order Y <- Y[, rev(seq_len(p)), drop = FALSE] # calc scaling factor sum_0 <- sum(exp( theta %*% apply(allY(p), 1, function(y) outer(y, y, `&`))[lower.tri(diag(p), diag = TRUE), ] )) # evaluate log likelihood -log(sum_0) + mean( theta %*% apply(Y, 1, function(y) outer(y, y, `&`))[lower.tri(diag(p), diag = TRUE), ] ) } G.num <- local({ h <- 1e-6 mapply(function(i) { delta <- h * (seq_along(theta) == i) (log.likelihood(theta + delta, Y) - log.likelihood(theta - delta, Y)) / (2 * h) }, seq_along(theta)) }) data.frame(G, G.num) for (n in c(2, 7, 12, 13, 14)) { for (p in 1:4) { cat(sprintf("%6d / %6d\n", sum(mapply(choose, n, 0:p)), nrSubSets(n, p))) } } p <- 5 (A <- tcrossprod(apply(allY(p), 1, function(y) outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)]))) print.table(B <- ising_fisher_info(theta), zero.print = ".") all.equal(A[lower.tri(A, TRUE)], B[lower.tri(B, TRUE)]) ising_fisher_info.R <- function(theta, p) { stopifnot(2 * length(theta) == p * (p + 1)) Y <- allY(p) # Ising model scaling factor for `P(Y = y) = p_0 exp(vech(y y')' theta)` sum_0 <- sum(apply(Y, 1, function(y) { vechYY <- outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)] exp(sum(vechYY * theta)) })) p_0 <- 1 / sum_0 # E[vech(Y Y')] EvechYY <- p_0 * rowSums(apply(Y, 1, function(y) { vechYY <- outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)] exp(sum(vechYY * theta)) * vechYY })) # E[vech(Y Y') vech(Y Y')'] EvechYYvechYY <- p_0 * matrix(rowSums(apply(Y, 1, function(y) { vechYY <- outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)] exp(sum(vechYY * theta)) * outer(vechYY, vechYY) })), p * (p + 1) / 2) # Cov(vech(Y Y'), vech(Y Y')) = E[vech(Y Y') vech(Y Y')'] - E[vech(Y Y')] E[vech(Y Y')]' EvechYYvechYY - outer(EvechYY, EvechYY) } all.equal( ising_fisher_info.R(theta, p), ising_fisher_info(theta) ) p <- 10 theta <- rnorm(p * (p + 1) / 2) microbenchmark::microbenchmark( ising_fisher_info.R(theta, p), ising_fisher_info(theta) ) ising_fisher_scoring <- function(Y) { # initial estimate (guess) ltri <- which(lower.tri(diag(p), diag = TRUE)) theta <- ising_theta_from_cond_prob(rowMeans(apply(Y, 1, function(y) outer(y, y, `&`)[ltri]))) print(theta) ll <- log.likelihood(theta, Y) # iterate Fisher scoring for (iter in 1:20) { theta <- theta + solve(ising_fisher_info(theta), ising_score(theta, Y)) ll <- c(ll, log.likelihood(theta, Y)) cat("ll: ", tail(ll, 1), "\n") } theta } ising_fisher_scoring(Y) microbenchmark::microbenchmark( cov.mvbinary(Y), # double copy (TODO: change MVBinary conversion/SEXP binding) cov(Y), # call the next expr. through default args .Call(stats:::C_cov, Y, NULL, na.method = 4L, FALSE) ) ################################################################################ ### Conditional Ising Model ### ################################################################################ n <- 1000 p <- 10 q <- 10 alpha <- matrix(rnorm(p * q), p) X <- matrix(rnorm(n * p), n) theta <- function(alpha, x) { Theta <- crossprod(crossprod(x, alpha)) diag(Theta) <- 0.5 * diag(Theta) 2 * Theta[lower.tri(diag(ncol(alpha)), diag = TRUE)] } # sample Y ~ P( . | X = x) for x in X system.time(Y <- apply(X, 1, function(x) ising_sample(1, theta(alpha, x)))) attr(Y, "p") <- as.integer(q) class(Y) <- "mvbinary" # For the numeric gradient comparison allY <- function(p) { events <- c(FALSE, TRUE) for (. in seq_len(p - 1)) { events <- rbind( cbind(FALSE, events), cbind( TRUE, events) ) } events } ising_conditional_log_likelihood.R <- function(alpha, X, Y) { # convert Y to a binary matrix Y <- as.mvbmatrix(Y) #retrieve dimensions n <- nrow(X) p <- ncol(X) q <- ncol(Y) # check dimensions stopifnot({ nrow(Y) == n all(dim(alpha) == c(p, q)) }) # setup reused internal variables vech_index <- which(lower.tri(diag(q), diag = TRUE)) aaY <- apply(allY(q), 1, function(y) outer(y, y, `&`))[vech_index, ] # sum over all observations ll <- 0 for (i in seq_len(n)) { # Theta = alpha' x x' alpha Theta <- crossprod(crossprod(X[i, ], alpha)) # theta = vech((2 1_q 1_q' - I_q) o Theta) theta <- ((2 - diag(q)) * Theta)[vech_index] # scaling factor `p_0^-1 = sum_y exp(vech(y y')' theta)` sum_0 <- sum(exp(theta %*% aaY)) print(log(sum_0)) # evaluate log likelihood ll <- ll + sum(theta * outer(Y[i, ], Y[i, ], `&`)[vech_index]) - log(sum_0) } ll / n } # numeric gradiend (score of the log-likelihood) ising_conditional_score.R <- function(alpha, X, Y, h = 1e-6) { matrix(mapply(function(i) { delta <- h * (seq_along(alpha) == i) (ising_conditional_log_likelihood.R(alpha + delta, X, Y) - ising_conditional_log_likelihood.R(alpha - delta, X, Y)) / (2 * h) }, seq_along(alpha)), nrow(alpha)) } stopifnot(all.equal( ising_conditional_log_likelihood.R(alpha, X, Y), ising_conditional_log_likelihood(alpha, X, Y) )) microbenchmark::microbenchmark( ising_conditional_log_likelihood.R(alpha, X, Y), ising_conditional_log_likelihood(alpha, X, Y) ) stopifnot(all.equal( ising_conditional_score.R(alpha, X, Y), ising_conditional_score(alpha, X, Y) )) microbenchmark::microbenchmark( ising_conditional_score.R(alpha, X, Y), ising_conditional_score(alpha, X, Y) ) ################################################################################ ### Fit Conditional Ising Model ### ################################################################################ ising_conditional_fit <- function(X, Y, ..., callback = NULL) { # get and check dimensions n <- if (is.null(nrow(Y))) length(Y) else nrow(Y) p <- ncol(X) q <- if (is.null(ncol(Y))) attr(Y, "p") else ncol(Y) # check dimensions stopifnot(nrow(X) == n) ### Initial value estimate # SVD of the predictor covariance estimate `Sigma = U_Sigma D_Sigma U_Sigma'` SigmaSVD <- La.svd(cov(X), min(p, q), 0) # Estimate `pi` as the single and two way effect means (approx conditional # probabilities through the marginal probability estimate) pi <- mean.mvbinary(Y, twoway = TRUE) # convert conditional probabilities into natural parameters (log-odds) theta <- ising_theta_from_cond_prob(pi) # convert natural parameters `theta` to square matrix form `Theta` Theta <- matrix(NA, q, q) Theta[lower.tri(diag(q), diag = TRUE)] <- theta Theta[upper.tri(diag(q))] <- t(Theta)[upper.tri(diag(q))] Theta <- (0.5 + diag(0.5, q, q)) * Theta # SVD of `Theta` ThetaSVD <- La.svd(Theta, min(p, q), 0) # Finally, initial `alpha` parameter estimate # `alpha_0 = U_Sigma D_Sigma^-1/2 D_Theta^1/2 U_Theta'` alpha <- with(list(S = SigmaSVD, T = ThetaSVD), { S$u %*% diag(sqrt(T$d[seq_len(min(p, q))] / S$d[seq_len(min(p, q))])) %*% t(T$u) }) ### Optimize log-likelihood for `alpha` tensorPredictors::NAGD( fun.loss = function(alpha) -ising_conditional_log_likelihood(alpha, X, Y), fun.grad = function(alpha) -ising_conditional_score(alpha, X, Y), params = alpha, ..., callback = callback ) } n <- 1000 p <- 7 q <- 9 alpha.true <- matrix(rnorm(p * q), p) X <- matrix(rnorm(n * p), n) theta <- function(alpha, x) { Theta <- crossprod(crossprod(x, alpha)) diag(Theta) <- 0.5 * diag(Theta) 2 * Theta[lower.tri(diag(ncol(alpha)), diag = TRUE)] } # sample Y ~ P( . | X = x) for x in X Y <- apply(X, 1, function(x) ising_sample(1, theta(alpha.true, x))) attr(Y, "p") <- as.integer(q) max.iter <- 100L ising_conditional_fit(X, Y, max.iter = max.iter, callback = function(iter, alpha) { cat(sprintf( "%4d/%4d - diff: %12.4f - ll: %12.4f\n", iter, max.iter, min(norm(alpha - alpha.true, "F"), norm(alpha + alpha.true, "F")), ising_conditional_log_likelihood(alpha, X, Y) )) }) ising_conditional_log_likelihood(alpha.true, X, Y) ising_conditional_log_likelihood.R(alpha.true, X, Y) for (. in 1:10) { print(ising_conditional_log_likelihood(matrix(rnorm(p * q), p, q), X, Y)) } YY <- as.mvbmatrix(Y) microbenchmark::microbenchmark( mean.mvbinary(Y, twoway = TRUE), rowMeans(apply(YY, 1, function(y) outer(y, y, `&`)))[lower.tri(diag(q), diag = TRUE)] ) par(mfrow = c(2, 2)) tensorPredictors::matrixImage(alpha) tensorPredictors::matrixImage(alpha.true) tensorPredictors::matrixImage(alpha) tensorPredictors::matrixImage(-alpha.true)