tensor_predictors/mvbernoulli/inst/examples/ising_grad.R

364 lines
11 KiB
R
Raw Normal View History

library(mvbernoulli)
printMVBinary <- function(Y) {
Y <- array(as.integer(Y), dim = dim(Y))
eventIndex <- seq_len(nrow(Y))
eventNr <- apply(Y, 1, function(y) sum(y * 2^(rev(seq_len(p)) - 1)))
dimnames(Y) <- list(
"Index/Event" = paste(eventIndex, eventNr, sep = "/"),
"Bit Index" = as.character(rev(seq_len(p)) - 1)
)
print.table(Y, zero.print = ".")
}
n <- 100
p <- 6
(theta <- rnorm(p * (p + 1) / 2))
pi <- ising_cond_probs(theta)
all.equal(
theta,
ising_theta_from_cond_prob(pi)
)
tensorPredictors::matrixImage({
Theta <- matrix(NA, p, p)
Theta[lower.tri(Theta, diag = TRUE)] <- theta
Theta[upper.tri(Theta)] <- t(Theta)[upper.tri(Theta)]
Theta
}, main = expression(paste("natural Params ", Theta)))
tensorPredictors::matrixImage({
PI <- matrix(NA, p, p)
PI[lower.tri(PI, diag = TRUE)] <- ising_cond_probs(theta)
PI[upper.tri(PI)] <- t(PI)[upper.tri(PI)]
PI
}, main = expression(paste("Conditional Probs. P(", Y[i], " = ", Y[j], " = 1", " | ", Y[-i - j], " = ", 0, ")")))
tensorPredictors::matrixImage({
MAR <- matrix(NA, p, p)
MAR[lower.tri(MAR, diag = TRUE)] <- ising_marginal_probs(theta)
MAR[upper.tri(MAR)] <- t(MAR)[upper.tri(MAR)]
MAR
}, main = expression(paste("Marginal Probs. P(", Y[i], " = ", Y[j], " = 1)")))
Y <- matrix(sample(c(TRUE, FALSE), n * p, replace = TRUE), n)
printMVBinary(Y)
allY <- function(p) {
events <- c(FALSE, TRUE)
for (. in seq_len(p - 1)) {
events <- rbind(
cbind(FALSE, events),
cbind( TRUE, events)
)
}
events
}
printMVBinary(allY(p))
G <- ising_score(theta, Y)
# Numeric gradiend
log.likelihood <- function(theta, Y) {
p <- ncol(Y)
# check sizes
stopifnot(p * (p + 1) == 2 * length(theta))
# and reverse column order
# this is needed as internally the left are the high bits (high index) and
# the right are the low bits (low index) which means for matching indices
# we need to reverse the column order
Y <- Y[, rev(seq_len(p)), drop = FALSE]
# calc scaling factor
sum_0 <- sum(exp(
theta %*% apply(allY(p), 1, function(y) outer(y, y, `&`))[lower.tri(diag(p), diag = TRUE), ]
))
# evaluate log likelihood
-log(sum_0) + mean(
theta %*% apply(Y, 1, function(y) outer(y, y, `&`))[lower.tri(diag(p), diag = TRUE), ]
)
}
G.num <- local({
h <- 1e-6
mapply(function(i) {
delta <- h * (seq_along(theta) == i)
(log.likelihood(theta + delta, Y) - log.likelihood(theta - delta, Y)) / (2 * h)
}, seq_along(theta))
})
data.frame(G, G.num)
for (n in c(2, 7, 12, 13, 14)) {
for (p in 1:4) {
cat(sprintf("%6d / %6d\n", sum(mapply(choose, n, 0:p)), nrSubSets(n, p)))
}
}
p <- 5
(A <- tcrossprod(apply(allY(p), 1, function(y) outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)])))
print.table(B <- ising_fisher_info(theta), zero.print = ".")
all.equal(A[lower.tri(A, TRUE)], B[lower.tri(B, TRUE)])
ising_fisher_info.R <- function(theta, p) {
stopifnot(2 * length(theta) == p * (p + 1))
Y <- allY(p)
# Ising model scaling factor for `P(Y = y) = p_0 exp(vech(y y')' theta)`
sum_0 <- sum(apply(Y, 1, function(y) {
vechYY <- outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)]
exp(sum(vechYY * theta))
}))
p_0 <- 1 / sum_0
# E[vech(Y Y')]
EvechYY <- p_0 * rowSums(apply(Y, 1, function(y) {
vechYY <- outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)]
exp(sum(vechYY * theta)) * vechYY
}))
# E[vech(Y Y') vech(Y Y')']
EvechYYvechYY <- p_0 * matrix(rowSums(apply(Y, 1, function(y) {
vechYY <- outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)]
exp(sum(vechYY * theta)) * outer(vechYY, vechYY)
})), p * (p + 1) / 2)
# Cov(vech(Y Y'), vech(Y Y')) = E[vech(Y Y') vech(Y Y')'] - E[vech(Y Y')] E[vech(Y Y')]'
EvechYYvechYY - outer(EvechYY, EvechYY)
}
all.equal(
ising_fisher_info.R(theta, p),
ising_fisher_info(theta)
)
p <- 10
theta <- rnorm(p * (p + 1) / 2)
microbenchmark::microbenchmark(
ising_fisher_info.R(theta, p),
ising_fisher_info(theta)
)
ising_fisher_scoring <- function(Y) {
# initial estimate (guess)
ltri <- which(lower.tri(diag(p), diag = TRUE))
theta <- ising_theta_from_cond_prob(rowMeans(apply(Y, 1, function(y) outer(y, y, `&`)[ltri])))
print(theta)
ll <- log.likelihood(theta, Y)
# iterate Fisher scoring
for (iter in 1:20) {
theta <- theta + solve(ising_fisher_info(theta), ising_score(theta, Y))
ll <- c(ll, log.likelihood(theta, Y))
cat("ll: ", tail(ll, 1), "\n")
}
theta
}
ising_fisher_scoring(Y)
microbenchmark::microbenchmark(
cov.mvbinary(Y), # double copy (TODO: change MVBinary conversion/SEXP binding)
cov(Y), # call the next expr. through default args
.Call(stats:::C_cov, Y, NULL, na.method = 4L, FALSE)
)
################################################################################
### Conditional Ising Model ###
################################################################################
n <- 1000
p <- 10
q <- 10
alpha <- matrix(rnorm(p * q), p)
X <- matrix(rnorm(n * p), n)
theta <- function(alpha, x) {
Theta <- crossprod(crossprod(x, alpha))
diag(Theta) <- 0.5 * diag(Theta)
2 * Theta[lower.tri(diag(ncol(alpha)), diag = TRUE)]
}
# sample Y ~ P( . | X = x) for x in X
system.time(Y <- apply(X, 1, function(x) ising_sample(1, theta(alpha, x))))
attr(Y, "p") <- as.integer(q)
class(Y) <- "mvbinary"
# For the numeric gradient comparison
allY <- function(p) {
events <- c(FALSE, TRUE)
for (. in seq_len(p - 1)) {
events <- rbind(
cbind(FALSE, events),
cbind( TRUE, events)
)
}
events
}
ising_conditional_log_likelihood.R <- function(alpha, X, Y) {
# convert Y to a binary matrix
Y <- as.mvbmatrix(Y)
#retrieve dimensions
n <- nrow(X)
p <- ncol(X)
q <- ncol(Y)
# check dimensions
stopifnot({
nrow(Y) == n
all(dim(alpha) == c(p, q))
})
# setup reused internal variables
vech_index <- which(lower.tri(diag(q), diag = TRUE))
aaY <- apply(allY(q), 1, function(y) outer(y, y, `&`))[vech_index, ]
# sum over all observations
ll <- 0
for (i in seq_len(n)) {
# Theta = alpha' x x' alpha
Theta <- crossprod(crossprod(X[i, ], alpha))
# theta = vech((2 1_q 1_q' - I_q) o Theta)
theta <- ((2 - diag(q)) * Theta)[vech_index]
# scaling factor `p_0^-1 = sum_y exp(vech(y y')' theta)`
sum_0 <- sum(exp(theta %*% aaY))
print(log(sum_0))
# evaluate log likelihood
ll <- ll + sum(theta * outer(Y[i, ], Y[i, ], `&`)[vech_index]) - log(sum_0)
}
ll / n
}
# numeric gradiend (score of the log-likelihood)
ising_conditional_score.R <- function(alpha, X, Y, h = 1e-6) {
matrix(mapply(function(i) {
delta <- h * (seq_along(alpha) == i)
(ising_conditional_log_likelihood.R(alpha + delta, X, Y) -
ising_conditional_log_likelihood.R(alpha - delta, X, Y)) / (2 * h)
}, seq_along(alpha)), nrow(alpha))
}
stopifnot(all.equal(
ising_conditional_log_likelihood.R(alpha, X, Y),
ising_conditional_log_likelihood(alpha, X, Y)
))
microbenchmark::microbenchmark(
ising_conditional_log_likelihood.R(alpha, X, Y),
ising_conditional_log_likelihood(alpha, X, Y)
)
stopifnot(all.equal(
ising_conditional_score.R(alpha, X, Y),
ising_conditional_score(alpha, X, Y)
))
microbenchmark::microbenchmark(
ising_conditional_score.R(alpha, X, Y),
ising_conditional_score(alpha, X, Y)
)
################################################################################
### Fit Conditional Ising Model ###
################################################################################
ising_conditional_fit <- function(X, Y, ..., callback = NULL) {
# get and check dimensions
n <- if (is.null(nrow(Y))) length(Y) else nrow(Y)
p <- ncol(X)
q <- if (is.null(ncol(Y))) attr(Y, "p") else ncol(Y)
# check dimensions
stopifnot(nrow(X) == n)
### Initial value estimate
# SVD of the predictor covariance estimate `Sigma = U_Sigma D_Sigma U_Sigma'`
SigmaSVD <- La.svd(cov(X), min(p, q), 0)
# Estimate `pi` as the single and two way effect means (approx conditional
# probabilities through the marginal probability estimate)
pi <- mean.mvbinary(Y, twoway = TRUE)
# convert conditional probabilities into natural parameters (log-odds)
theta <- ising_theta_from_cond_prob(pi)
# convert natural parameters `theta` to square matrix form `Theta`
Theta <- matrix(NA, q, q)
Theta[lower.tri(diag(q), diag = TRUE)] <- theta
Theta[upper.tri(diag(q))] <- t(Theta)[upper.tri(diag(q))]
Theta <- (0.5 + diag(0.5, q, q)) * Theta
# SVD of `Theta`
ThetaSVD <- La.svd(Theta, min(p, q), 0)
# Finally, initial `alpha` parameter estimate
# `alpha_0 = U_Sigma D_Sigma^-1/2 D_Theta^1/2 U_Theta'`
alpha <- with(list(S = SigmaSVD, T = ThetaSVD), {
S$u %*% diag(sqrt(T$d[seq_len(min(p, q))] / S$d[seq_len(min(p, q))])) %*% t(T$u)
})
### Optimize log-likelihood for `alpha`
tensorPredictors::NAGD(
fun.loss = function(alpha) -ising_conditional_log_likelihood(alpha, X, Y),
fun.grad = function(alpha) -ising_conditional_score(alpha, X, Y),
params = alpha,
...,
callback = callback
)
}
n <- 1000
p <- 7
q <- 9
alpha.true <- matrix(rnorm(p * q), p)
X <- matrix(rnorm(n * p), n)
theta <- function(alpha, x) {
Theta <- crossprod(crossprod(x, alpha))
diag(Theta) <- 0.5 * diag(Theta)
2 * Theta[lower.tri(diag(ncol(alpha)), diag = TRUE)]
}
# sample Y ~ P( . | X = x) for x in X
Y <- apply(X, 1, function(x) ising_sample(1, theta(alpha.true, x)))
attr(Y, "p") <- as.integer(q)
max.iter <- 100L
ising_conditional_fit(X, Y, max.iter = max.iter, callback = function(iter, alpha) {
cat(sprintf(
"%4d/%4d - diff: %12.4f - ll: %12.4f\n",
iter, max.iter,
min(norm(alpha - alpha.true, "F"), norm(alpha + alpha.true, "F")),
ising_conditional_log_likelihood(alpha, X, Y)
))
})
ising_conditional_log_likelihood(alpha.true, X, Y)
ising_conditional_log_likelihood.R(alpha.true, X, Y)
for (. in 1:10) {
print(ising_conditional_log_likelihood(matrix(rnorm(p * q), p, q), X, Y))
}
YY <- as.mvbmatrix(Y)
microbenchmark::microbenchmark(
mean.mvbinary(Y, twoway = TRUE),
rowMeans(apply(YY, 1, function(y) outer(y, y, `&`)))[lower.tri(diag(q), diag = TRUE)]
)
par(mfrow = c(2, 2))
tensorPredictors::matrixImage(alpha)
tensorPredictors::matrixImage(alpha.true)
tensorPredictors::matrixImage(alpha)
tensorPredictors::matrixImage(-alpha.true)