library(mvbernoulli)

# pretty-print a logical event matrix `Y` with event numbers and bit indices
printMVBinary <- function(Y) {
    Y <- array(as.integer(Y), dim = dim(Y))
    p <- ncol(Y)
    eventIndex <- seq_len(nrow(Y))
    eventNr <- apply(Y, 1, function(y) sum(y * 2^(rev(seq_len(p)) - 1)))
    dimnames(Y) <- list(
        "Index/Event" = paste(eventIndex, eventNr, sep = "/"),
        "Bit Index" = as.character(rev(seq_len(p)) - 1)
    )
    print.table(Y, zero.print = ".")
}

n <- 100
p <- 6

(theta <- rnorm(p * (p + 1) / 2))

# round-trip check: conditional probabilities back to natural parameters
pi <- ising_cond_probs(theta)
all.equal(
    theta,
    ising_theta_from_cond_prob(pi)
)

tensorPredictors::matrixImage({
    Theta <- matrix(NA, p, p)
    Theta[lower.tri(Theta, diag = TRUE)] <- theta
    Theta[upper.tri(Theta)] <- t(Theta)[upper.tri(Theta)]
    Theta
}, main = expression(paste("natural Params ", Theta)))
tensorPredictors::matrixImage({
    PI <- matrix(NA, p, p)
    PI[lower.tri(PI, diag = TRUE)] <- ising_cond_probs(theta)
    PI[upper.tri(PI)] <- t(PI)[upper.tri(PI)]
    PI
}, main = expression(paste("Conditional Probs. P(", Y[i], " = ", Y[j], " = 1", " | ", Y[-i - j], " = ", 0, ")")))
tensorPredictors::matrixImage({
    MAR <- matrix(NA, p, p)
    MAR[lower.tri(MAR, diag = TRUE)] <- ising_marginal_probs(theta)
    MAR[upper.tri(MAR)] <- t(MAR)[upper.tri(MAR)]
    MAR
}, main = expression(paste("Marginal Probs. P(", Y[i], " = ", Y[j], " = 1)")))

Y <- matrix(sample(c(TRUE, FALSE), n * p, replace = TRUE), n)
printMVBinary(Y)

# enumerate all 2^p binary events as rows of a logical matrix
allY <- function(p) {
    events <- c(FALSE, TRUE)
    for (. in seq_len(p - 1)) {
        events <- rbind(
            cbind(FALSE, events),
            cbind( TRUE, events)
        )
    }
    events
}
printMVBinary(allY(p))
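# Brute-force cross-check (a sketch; assumes `ising_marginal_probs()` returns
# the pair probabilities P(Y_i = Y_j = 1) in the same vech order as `theta`):
# enumerate all 2^p events and accumulate the pair probabilities directly.
local({
    vechYY <- apply(allY(p), 1, function(y) outer(y, y, `&`))[lower.tri(diag(p), diag = TRUE), ]
    probs <- drop(exp(theta %*% vechYY))
    all.equal(c(vechYY %*% (probs / sum(probs))), c(ising_marginal_probs(theta)))
})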

G <- ising_score(theta, Y)
# Numeric gradient: log-likelihood used for the finite differences below
log.likelihood <- function(theta, Y) {
    p <- ncol(Y)
    # check sizes
    stopifnot(p * (p + 1) == 2 * length(theta))
    # Reverse the column order: internally the leftmost column holds the
    # high bit (highest index) and the rightmost the low bit (lowest index),
    # so matching indices requires reversing the columns.
    Y <- Y[, rev(seq_len(p)), drop = FALSE]
    # calc scaling factor
    sum_0 <- sum(exp(
        theta %*% apply(allY(p), 1, function(y) outer(y, y, `&`))[lower.tri(diag(p), diag = TRUE), ]
    ))
    # evaluate log likelihood
    -log(sum_0) + mean(
        theta %*% apply(Y, 1, function(y) outer(y, y, `&`))[lower.tri(diag(p), diag = TRUE), ]
    )
}
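# Bit order illustration (hypothetical p = 3 event): y = (1, 0, 1) has event
# number 1 * 2^2 + 0 * 2^1 + 1 * 2^0 = 5, i.e. the leftmost column is the
# highest bit, which is why the columns are reversed above.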

G.num <- local({
    h <- 1e-6
    mapply(function(i) {
        delta <- h * (seq_along(theta) == i)
        (log.likelihood(theta + delta, Y) - log.likelihood(theta - delta, Y)) / (2 * h)
    }, seq_along(theta))
})

data.frame(G, G.num)
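# Analytic and numeric scores should agree up to finite-difference error
# (the tolerance here is a guess matched to the step size h = 1e-6).
all.equal(c(G), c(G.num), tolerance = 1e-4)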

# check nrSubSets(n, p) against the explicit count sum_{k = 0}^{p} choose(n, k)
for (n in c(2, 7, 12, 13, 14)) {
    for (p in 1:4) {
        cat(sprintf("%6d / %6d\n", sum(mapply(choose, n, 0:p)), nrSubSets(n, p)))
    }
}

p <- 5
(A <- tcrossprod(apply(allY(p), 1, function(y) outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)])))

print.table(B <- ising_fisher_info(theta), zero.print = ".")

all.equal(A[lower.tri(A, TRUE)], B[lower.tri(B, TRUE)])

ising_fisher_info.R <- function(theta, p) {
    stopifnot(2 * length(theta) == p * (p + 1))

    Y <- allY(p)

    # Ising model scaling factor for `P(Y = y) = p_0 exp(vech(y y')' theta)`
    sum_0 <- sum(apply(Y, 1, function(y) {
        vechYY <- outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)]
        exp(sum(vechYY * theta))
    }))
    p_0 <- 1 / sum_0

    # E[vech(Y Y')]
    EvechYY <- p_0 * rowSums(apply(Y, 1, function(y) {
        vechYY <- outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)]
        exp(sum(vechYY * theta)) * vechYY
    }))

    # E[vech(Y Y') vech(Y Y')']
    EvechYYvechYY <- p_0 * matrix(rowSums(apply(Y, 1, function(y) {
        vechYY <- outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)]
        exp(sum(vechYY * theta)) * outer(vechYY, vechYY)
    })), p * (p + 1) / 2)

    # Cov(vech(Y Y'), vech(Y Y')) = E[vech(Y Y') vech(Y Y')'] - E[vech(Y Y')] E[vech(Y Y')]'
    EvechYYvechYY - outer(EvechYY, EvechYY)
}

all.equal(
    ising_fisher_info.R(theta, p),
    ising_fisher_info(theta)
)

p <- 10
theta <- rnorm(p * (p + 1) / 2)
microbenchmark::microbenchmark(
    ising_fisher_info.R(theta, p),
    ising_fisher_info(theta)
)

ising_fisher_scoring <- function(Y) {
    p <- ncol(Y)
    # initial estimate (guess)
    ltri <- which(lower.tri(diag(p), diag = TRUE))
    theta <- ising_theta_from_cond_prob(rowMeans(apply(Y, 1, function(y) outer(y, y, `&`)[ltri])))

    print(theta)

    ll <- log.likelihood(theta, Y)

    # iterate Fisher scoring
    for (iter in 1:20) {
        theta <- theta + solve(ising_fisher_info(theta), ising_score(theta, Y))

        ll <- c(ll, log.likelihood(theta, Y))

        cat("ll: ", tail(ll, 1), "\n")
    }

    theta
}
ising_fisher_scoring(Y)
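# Possible refinement (a sketch, not part of the routine above): stop the
# scoring loop early once the log-likelihood gain stalls, e.g. by adding
#     if (abs(diff(tail(ll, 2))) < 1e-8) break
# at the end of each iteration.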

microbenchmark::microbenchmark(
    cov.mvbinary(Y), # double copy (TODO: change MVBinary conversion/SEXP binding)
    cov(Y), # calls the next expr. through default args
    .Call(stats:::C_cov, Y, NULL, na.method = 4L, FALSE)
)

################################################################################
###  Conditional Ising Model                                                ###
################################################################################
n <- 1000
p <- 10
q <- 10

alpha <- matrix(rnorm(p * q), p)
X <- matrix(rnorm(n * p), n)
theta <- function(alpha, x) {
    Theta <- crossprod(crossprod(x, alpha))
    diag(Theta) <- 0.5 * diag(Theta)
    2 * Theta[lower.tri(diag(ncol(alpha)), diag = TRUE)]
}
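# Note: halving the diagonal of `Theta` and then doubling all vech entries is
# equivalent to `theta = vech((2 1_q 1_q' - I_q) o Theta)`, the scaling used
# in `ising_conditional_log_likelihood.R` below.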
# sample Y ~ P( . | X = x) for x in X
system.time(Y <- apply(X, 1, function(x) ising_sample(1, theta(alpha, x))))
attr(Y, "p") <- as.integer(q)
class(Y) <- "mvbinary"

# For the numeric gradient comparison, allY() from above is reused
# (it enumerates all 2^p binary events).
ising_conditional_log_likelihood.R <- function(alpha, X, Y) {
    # convert Y to a binary matrix
    Y <- as.mvbmatrix(Y)
    # retrieve dimensions
    n <- nrow(X)
    p <- ncol(X)
    q <- ncol(Y)
    # check dimensions (note: `stopifnot({...})` would only test the last
    # expression of the block, so the conditions are passed separately)
    stopifnot(
        nrow(Y) == n,
        all(dim(alpha) == c(p, q))
    )

    # setup reused internal variables
    vech_index <- which(lower.tri(diag(q), diag = TRUE))
    aaY <- apply(allY(q), 1, function(y) outer(y, y, `&`))[vech_index, ]

    # sum over all observations
    ll <- 0
    for (i in seq_len(n)) {
        # Theta = alpha' x x' alpha
        Theta <- crossprod(crossprod(X[i, ], alpha))
        # theta = vech((2 1_q 1_q' - I_q) o Theta)
        theta <- ((2 - diag(q)) * Theta)[vech_index]

        # scaling factor `p_0^-1 = sum_y exp(vech(y y')' theta)`
        sum_0 <- sum(exp(theta %*% aaY))

        # evaluate log likelihood
        ll <- ll + sum(theta * outer(Y[i, ], Y[i, ], `&`)[vech_index]) - log(sum_0)
    }

    ll / n
}
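# Closed-form sanity check (a sketch): with `alpha = 0` all 2^q events are
# equally likely, so the mean log-likelihood is exactly -log(2^q).
all.equal(
    ising_conditional_log_likelihood.R(matrix(0, p, q), X, Y),
    -q * log(2)
)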
# numeric gradient (score of the log-likelihood)
ising_conditional_score.R <- function(alpha, X, Y, h = 1e-6) {
    matrix(mapply(function(i) {
        delta <- h * (seq_along(alpha) == i)
        (ising_conditional_log_likelihood.R(alpha + delta, X, Y) -
            ising_conditional_log_likelihood.R(alpha - delta, X, Y)) / (2 * h)
    }, seq_along(alpha)), nrow(alpha))
}

stopifnot(all.equal(
    ising_conditional_log_likelihood.R(alpha, X, Y),
    ising_conditional_log_likelihood(alpha, X, Y)
))
microbenchmark::microbenchmark(
    ising_conditional_log_likelihood.R(alpha, X, Y),
    ising_conditional_log_likelihood(alpha, X, Y)
)

stopifnot(all.equal(
    ising_conditional_score.R(alpha, X, Y),
    ising_conditional_score(alpha, X, Y)
))
microbenchmark::microbenchmark(
    ising_conditional_score.R(alpha, X, Y),
    ising_conditional_score(alpha, X, Y)
)

################################################################################
###  Fit Conditional Ising Model                                            ###
################################################################################

ising_conditional_fit <- function(X, Y, ..., callback = NULL) {
    # get and check dimensions
    n <- if (is.null(nrow(Y))) length(Y) else nrow(Y)
    p <- ncol(X)
    q <- if (is.null(ncol(Y))) attr(Y, "p") else ncol(Y)
    # check dimensions
    stopifnot(nrow(X) == n)

    ### Initial value estimate
    # SVD of the predictor covariance estimate `Sigma = U_Sigma D_Sigma U_Sigma'`
    SigmaSVD <- La.svd(cov(X), min(p, q), 0)

    # Estimate `pi` as the single and two-way effect means (approximating the
    # conditional probabilities through the marginal probability estimates)
    pi <- mean.mvbinary(Y, twoway = TRUE)

    # convert conditional probabilities into natural parameters (log-odds)
    theta <- ising_theta_from_cond_prob(pi)

    # convert natural parameters `theta` to square matrix form `Theta`
    # (undoing the `(2 1_q 1_q' - I_q)` scaling of the off-diagonal entries)
    Theta <- matrix(NA, q, q)
    Theta[lower.tri(diag(q), diag = TRUE)] <- theta
    Theta[upper.tri(diag(q))] <- t(Theta)[upper.tri(diag(q))]
    Theta <- (0.5 + diag(0.5, q, q)) * Theta

    # SVD of `Theta`
    ThetaSVD <- La.svd(Theta, min(p, q), 0)

    # Finally, the initial `alpha` parameter estimate
    # `alpha_0 = U_Sigma D_Sigma^-1/2 D_Theta^1/2 U_Theta'`
    alpha <- with(list(S = SigmaSVD, T = ThetaSVD), {
        S$u %*% diag(sqrt(T$d[seq_len(min(p, q))] / S$d[seq_len(min(p, q))])) %*% t(T$u)
    })

    ### Optimize log-likelihood for `alpha`
    tensorPredictors::NAGD(
        fun.loss = function(alpha) -ising_conditional_log_likelihood(alpha, X, Y),
        fun.grad = function(alpha) -ising_conditional_score(alpha, X, Y),
        params = alpha,
        ...,
        callback = callback
    )
}

n <- 1000
p <- 7
q <- 9

alpha.true <- matrix(rnorm(p * q), p)
X <- matrix(rnorm(n * p), n)
# sample Y ~ P( . | X = x) for x in X, using theta(alpha, x) as defined above
Y <- apply(X, 1, function(x) ising_sample(1, theta(alpha.true, x)))
attr(Y, "p") <- as.integer(q)

max.iter <- 100L
alpha.est <- ising_conditional_fit(X, Y, max.iter = max.iter, callback = function(iter, alpha) {
    cat(sprintf(
        "%4d/%4d - diff: %12.4f - ll: %12.4f\n",
        iter, max.iter,
        # `alpha` is only identified up to sign, compare against both signs
        min(norm(alpha - alpha.true, "F"), norm(alpha + alpha.true, "F")),
        ising_conditional_log_likelihood(alpha, X, Y)
    ))
})

ising_conditional_log_likelihood(alpha.true, X, Y)
ising_conditional_log_likelihood.R(alpha.true, X, Y)

# log-likelihood at random parameter values, for reference
for (. in 1:10) {
    print(ising_conditional_log_likelihood(matrix(rnorm(p * q), p, q), X, Y))
}

YY <- as.mvbmatrix(Y)
microbenchmark::microbenchmark(
    mean.mvbinary(Y, twoway = TRUE),
    rowMeans(apply(YY, 1, function(y) outer(y, y, `&`)))[lower.tri(diag(q), diag = TRUE)]
)

# compare the estimate against both signs of the truth (sign indeterminacy)
par(mfrow = c(2, 2))
tensorPredictors::matrixImage(alpha.est)
tensorPredictors::matrixImage(alpha.true)
tensorPredictors::matrixImage(alpha.est)
tensorPredictors::matrixImage(-alpha.true)