#' Specialized version of the GMLM for the Ising model (inverse Ising problem) #' #' @todo TODO: Add beta and Omega projections #' #' @export gmlm_ising <- function(X, F, sample.axis = length(dim(X)), # proj.betas = ..., proj.Omegas = ..., # TODO: this max.iter = 1000L, eps = sqrt(.Machine$double.eps), step.size = 1e-3, zig.zag.threashold = 20L, patience = 3L, nr.slices = 20L, # only for univariate `F(y) = y` slice.method = c("cut", "ecdf", "none"), # only for univariate `F(y) = y` and `y` is a factor or integer logger = function(...) { } ) { # Get problem dimensions dimX <- dim(X)[-sample.axis] # threat scalar `F` as a tensor if (is.null(dim(F))) { dimF <- rep(1L, length(dimX)) dim(F) <- ifelse(seq_along(dim(X)) == sample.axis, sample.size, 1L) } else { dimF <- dim(F)[-sample.axis] } sample.size <- dim(X)[sample.axis] # rearrange `X`, `F` such that the last axis enumerates observations if (sample.axis != length(dim(X))) { axis.perm <- c(seq_along(dim(X))[-sample.axis], sample.axis) X <- aperm(X, axis.perm) F <- aperm(F, axis.perm) sample.axis <- length(dim(X)) } modes <- seq_along(dimX) # Special case for univariate response `vec F(y) = y` # Due to high computational costs we use slicing slice.method <- match.arg(slice.method) slices.ind <- if ((slice.method != "none") && (length(F) == prod(dim(F)))) { y <- as.vector(F) if (!(is.factor(y) || is.integer(y))) { slice.method <- match.arg(slice.method) if (slice.method == "ecdf") { y <- cut(ecdf(y)(y), nr.slices) } else { y <- cut(y, nr.slices) } } split(seq_len(sample.size), y, drop = TRUE) } else { seq_len(sample.size) } # initialize betas with tensor normal estimate (ignoring data being binary) fit_normal <- gmlm_tensor_normal(X, F, sample.axis = length(dim(X))) betas <- fit_normal$betas Omegas <- Omegas.init <- Map(function(mode) { n <- prod(dim(X)[-mode]) prob2 <- mcrossprod(X, mode = mode) / n prob2[prob2 == 0] <- 1 / n prob1 <- diag(prob2) `prob1^2` <- outer(prob1, prob1) `diag<-`(log(((1 - `prob1^2`) / `prob1^2`) * prob2 / (1 - prob2)), 0) }, modes) # Determin degenerate combinations, that are variables which are exclusive # in the data set matX <- mat(X, sample.axis) degen <- crossprod(matX) == 0 degen.mask <- which(degen) # If there are degenerate combination, compute an (arbitrary) bound the # log odds parameters of those combinations if (any(degen.mask)) { degen.ind <- arrayInd(degen.mask, dim(degen)) meanX <- colMeans(matX) prodX <- meanX[degen.ind[, 1]] * meanX[degen.ind[, 2]] degen.bounds <- log((1 - prodX) / (prodX * sample.size)) # Component indices in Omegas of degenerate two-way interactions degen.ind <- arrayInd(degen.mask, rep(dimX, 2)) degen.ind <- Map(function(d, m) { degen.ind[, m] + dimX[m] * (degen.ind[, m + length(dimX)] - 1L) }, dimX, seq_along(dimX)) ## Enforce initial value degeneracy interaction param. constraints # Extract parameters corresponding to degenerate interactions degen.params <- do.call(rbind, Map(`[`, Omegas, degen.ind)) # Degeneracy Constrained Parameters (sign is dropped) DCP <- mapply(function(vals, bound) { logVals <- log(abs(vals)) err <- max(0, sum(logVals) - log(abs(bound))) exp(logVals - (err / length(vals))) }, split(degen.params, col(degen.params)), degen.bounds) # Update values in Omegas such that all degeneracy constraints hold Omegas <- Map(function(Omega, cp, ind) { # Combine multiple constraints for every element into single # constraint value per element cp <- mapply(min, split(abs(cp), ind)) ind <- as.integer(names(cp)) `[<-`(Omega, ind, sign(Omega[ind]) * cp) }, Omegas, split(DCP, row(DCP)), degen.ind) } # Initialize mean squared gradients grad2_betas <- Map(array, 0, Map(dim, betas)) grad2_Omegas <- Map(array, 0, Map(dim, Omegas)) # Keep track of the last loss to accumulate loss difference sign changes # indicating optimization instabilities as a sign to stop last_loss <- Inf accum_sign <- 1 # non improving iteration counter non_improving <- 0L # technical access points to dynamicaly access a multi-dimensional array `X[..., i]` <- slice.expr(X, sample.axis, index = i, drop = FALSE) `F[..., i]` <- slice.expr(F, sample.axis, index = i, drop = FALSE) # Iterate till a break condition triggers or till max. nr. of iterations for (iter in seq_len(max.iter)) { grad_betas <- Map(matrix, 0, dimX, dimF) Omega <- Reduce(kronecker, rev(Omegas)) # second order residuals accumulator # `sum_i (X_i o X_i - E[X o X | Y = y_i])` R2 <- array(0, dim = c(dimX, dimX)) # negative log-likelihood loss <- 0 for (i in slices.ind) { # slice size (nr. of objects in the slice) n_i <- length(i) sumF_i <- rowSums(eval(`F[..., i]`), dims = length(dimF)) diag_params_i <- mlm(sumF_i / n_i, betas) params_i <- Omega + diag(as.vector(diag_params_i)) m2_i <- ising_m2(params_i) # accumulate loss matX_i <- mat(eval(`X[..., i]`), modes) loss <- loss - ( sum(matX_i * (params_i %*% matX_i)) + n_i * log(attr(m2_i, "prob_0")) ) R2_i <- tcrossprod(matX_i) - n_i * m2_i R1_i <- diag(R2_i) dim(R1_i) <- dimX for (j in modes) { grad_betas[[j]] <- grad_betas[[j]] + mcrossprod(R1_i, mlm(sumF_i, betas[-j], modes[-j]), j) } R2 <- R2 + as.vector(R2_i) } grad_Omegas <- Map(function(j) { grad <- mlm(kronperm(R2), Map(as.vector, Omegas[-j]), modes[-j], transposed = TRUE) dim(grad) <- dim(Omegas[[j]]) grad }, modes) # update optimization behavioral trackers accum_sign <- sign(last_loss - loss) - accum_sign non_improving <- max(0L, non_improving - 1L + 2L * (last_loss < loss)) # check break conditions if (abs(accum_sign) > zig.zag.threashold) { break } if (non_improving > patience) { break } if (abs(last_loss - loss) < eps * last_loss) { break } # store current loss for the next iteration last_loss <- loss # Accumulate root mean squared gradiends grad2_betas <- Map(function(g2, g) 0.9 * g2 + 0.1 * (g * g), grad2_betas, grad_betas) grad2_Omegas <- Map(function(g2, g) 0.9 * g2 + 0.1 * (g * g), grad2_Omegas, grad_Omegas) # logging (before parameter update) logger(iter, loss, betas, Omegas, grad_betas, grad_Omegas) # Update Parameters betas <- Map(function(beta, grad, m2) { beta + (step.size / (sqrt(m2) + eps)) * grad }, betas, grad_betas, grad2_betas) Omegas <- Map(function(Omega, grad, m2) { Omega + (step.size / (sqrt(m2) + eps)) * grad }, Omegas, grad_Omegas, grad2_Omegas) # Enforce degeneracy parameter constraints if (any(degen.mask)) { # Extract parameters corresponding to degenerate interactions degen.params <- do.call(rbind, Map(`[`, Omegas, degen.ind)) # Degeneracy Constrained Parameters (sign is dropped) DCP <- mapply(function(vals, bound) { logVals <- log(abs(vals)) err <- max(0, sum(logVals) - log(abs(bound))) exp(logVals - (err / length(vals))) }, split(degen.params, col(degen.params)), degen.bounds) # Update values in Omegas such that all degeneracy constraints hold Omegas <- Map(function(Omega, cp, ind) { # Combine multiple constraints for every element into single # constraint value per element cp <- mapply(min, split(abs(cp), ind)) ind <- as.integer(names(cp)) `[<-`(Omega, ind, sign(Omega[ind]) * cp) }, Omegas, split(DCP, row(DCP)), degen.ind) } } structure( list(eta1 = array(0, dimX), betas = betas, Omegas = Omegas), tensor_normal = fit_normal, Omegas.init = Omegas.init, degen.mask = degen.mask ) } ################################################################################ ### Development Interactive Block (Delete / Make sim / TODO: ...) ### ################################################################################ if (FALSE) { # interactive() par(bg = "#1d1d1d", fg = "lightgray", col = "#d5d5d5", col.axis = "#d5d5d5", col.lab = "#d5d5d5", col.main = "#d5d5d5", col.sub = "#d5d5d5", # col.sub = "#2467d0" pch = 16 ) cex <- 1.25 col <- colorRampPalette(c("#f15050", "#1d1d1d", "#567DCA"))(256) .logger <- function() { iter <- 0L assign("log", data.frame( iter = rep(NA_integer_, 100000), loss = rep(NA_real_, 100000), dist.B = rep(NA_real_, 100000), dist.Omega = rep(NA_real_, 100000), norm.grad.B = rep(NA_real_, 100000), norm.grad.Omega = rep(NA_real_, 100000) ), envir = .GlobalEnv) assign("B.gmlm", NULL, .GlobalEnv) assign("Omega.gmlm", NULL, .GlobalEnv) function(it, loss, betas, Omegas, grad_betas, grad_Omegas) { # Store in global namespace (allows to stop and get the results) B.gmlm <- Reduce(kronecker, rev(betas)) assign("B.gmlm", B.gmlm, .GlobalEnv) Omega.gmlm <- Reduce(kronecker, rev(Omegas)) assign("Omega.gmlm", Omega.gmlm, .GlobalEnv) dist.B <- dist.subspace(B.true, B.gmlm, normalize = TRUE) dist.Omega <- norm(Omega.true - Omega.gmlm, "F") norm.grad.B <- sqrt(sum(mapply(norm, grad_betas, "F")^2)) norm.grad.Omega <- sqrt(sum(mapply(norm, grad_Omegas, "F")^2)) log[iter <<- iter + 1L, ] <<- list( it, loss, dist.B, dist.Omega, norm.grad.B, norm.grad.Omega ) cat(sprintf("\r%3d - d(B): %.3f, d(O): %.3f, |g(B)|: %.3f, |g(O)|: %.3f, loss: %.3f\033[K", it, dist.B, dist.Omega, norm.grad.B, norm.grad.Omega, loss)) } } sample.size <- 1000 dimX <- c(2, 3) # predictor `X` dimension dimF <- rep(1, length(dimX)) # "function" `F(y)` of responce `y` dimension betas <- Map(diag, 1, dimX, dimF) Omegas <- list(toeplitz(c(0, -2)), toeplitz(seq(1, 0, by = -0.5))) B.true <- Reduce(kronecker, rev(betas)) Omega.true <- Reduce(kronecker, rev(Omegas)) # data sampling routine c(X, F, y, sample.axis) %<-% (sample.data <- function(sample.size, betas, Omegas) { dimX <- mapply(nrow, betas) dimF <- mapply(ncol, betas) # generate response (sample axis is last axis) y <- runif(prod(sample.size, dimF), -2, 2) F <- array(y, dim = c(dimF, sample.size)) # ~ U[-1, 1] Omega <- Reduce(kronecker, rev(Omegas)) X <- apply(F, length(dim(F)), function(Fi) { dim(Fi) <- dimF params <- diag(as.vector(mlm(Fi, betas))) + Omega tensorPredictors::ising_sample(1, params) }) dim(X) <- c(dimX, sample.size) list(X = X, F = F, y = y, sample.axis = length(dim(X))) })(sample.size, betas, Omegas) local({ X.proto <- array(seq_len(prod(dimX)), dimX) interactions <- crossprod(mat(X, sample.axis)) dimnames(interactions) <- rep(list( do.call(paste0, c("X", Map(slice.index, list(X.proto), seq_along(dimX)))) ), 2) cat("Sample Size: ", sample.size, "\n") print.table(interactions, zero.print = ".") }) # system.time({ # fit.gmlm <- gmlm_ising(X, y, logger = .logger()) # }) Rprof() gmlm_ising(X, y) Rprof(NULL) summaryRprof() B.gmlm <- Reduce(kronecker, rev(fit.gmlm$betas)) Omega.gmlm <- Reduce(kronecker, rev(fit.gmlm$Omegas)) B.normal <- Reduce(kronecker, rev(attr(fit.gmlm, "tensor_normal")$betas)) Omega.init <- Reduce(kronecker, rev(attr(fit.gmlm, "Omegas.init"))) degen.mask <- attr(fit.gmlm, "degen.mask") local({ layout(matrix(c( 1, 2, 3, 3, 3, 1, 4, 5, 6, 7 ), nrow = 2, byrow = TRUE), width = c(6, 3, 1, 1, 1)) with(na.omit(log), { plot(range(iter), c(0, 1), type = "n", bty = "n", xlab = "Iterations", ylab = "Distance") lines(iter, dist.B, col = "red", lwd = 2) lines(iter, dist.Omega / max(dist.Omega), col = "blue", lwd = 2) lines(iter, (loss - min(loss)) / diff(range(loss)), col = "darkgreen", lwd = 2) norm.grad <- sqrt(norm.grad.B^2 + norm.grad.Omega^2) # Scale all gradient norms norm.grad.B <- norm.grad.B / max(norm.grad) norm.grad.Omega <- norm.grad.Omega / max(norm.grad) norm.grad <- norm.grad / max(norm.grad) lines(iter, norm.grad.B, lty = 2, col = "red") lines(iter, norm.grad.Omega, lty = 2, col = "blue") lines(iter, norm.grad, lty = 2, col = "darkgreen") axis(4, at = c( tail(dist.B, 1), min(dist.B) ), labels = round(c( tail(dist.B, 1), min(dist.B) ), 2), col = NA, col.ticks = "red", las = 1) axis(4, at = c( 1, tail(dist.Omega, 1) / max(dist.Omega), min(dist.Omega) / max(dist.Omega) ), labels = round(c( max(dist.Omega), tail(dist.Omega, 1), min(dist.Omega) ), 2), col = NA, col.ticks = "blue", las = 1) abline(h = c(tail(dist.B, 1), min(dist.B)), lty = "dotted", col = "red") abline(h = c(max(dist.Omega), tail(dist.Omega, 1), min(dist.Omega)) / max(dist.Omega), lty = "dotted", col = "blue") }) legend("topright", col = c("red", "blue", "darkgreen"), lty = 1, lwd = 2, legend = c("dist.B", "dist.Omega", "loss"), bty = "n") zlim <- max(abs(range(Omega.true, Omega.init, Omega.gmlm))) * c(-1, 1) matrixImage(Omega.true, main = "true", zlim = zlim, add.values = TRUE, col = col, cex = cex) matrixImage(round(Omega.init, 2), main = "init (cond. prob.)", zlim = zlim, add.values = TRUE, col = col, cex = cex) mtext(round(norm(Omega.true - Omega.init, "F"), 3), 3) matrixImage(round(Omega.gmlm, 2), main = "gmlm (ising)", zlim = zlim, add.values = TRUE, col = col, cex = cex, col.values = c(par("col"), "red")[`[<-`(array(1, rep(prod(dim(X)[-sample.axis]), 2)), degen.mask, 2)]) mtext(round(norm(Omega.true - Omega.gmlm, "F"), 3), 3) zlim <- max(abs(range(B.true, B.normal, B.gmlm))) * c(-1, 1) matrixImage(B.true, main = "true", zlim = zlim, add.values = TRUE, col = col, cex = cex) matrixImage(round(B.normal, 2), main = "init (normal)", zlim = zlim, add.values = TRUE, axes = FALSE, col = col, cex = cex) mtext(round(dist.subspace(B.true, B.normal, normalize = TRUE), 3), 3) matrixImage(round(B.gmlm, 2), main = "gmlm (ising)", zlim = zlim, add.values = TRUE, axes = FALSE, col = col, cex = cex) mtext(round(dist.subspace(B.true, B.gmlm, normalize = TRUE), 3), 3) }) }