make.gmlm.family <- function(name) {
    # standardize the family name
    name <- list(
        normal = "normal",
        gaussian = "normal",
        bernoulli = "bernoulli",
        ising = "bernoulli"
    )[[tolower(name), exact = FALSE]]

    ############################################################################
    #                                                                          #
    #   TODO: better (and possibly specialized) initial parameters!            #
    #                                                                          #
    ############################################################################

    switch(name,
    ########################################################################
    ###                          Tensor Normal                           ###
    ########################################################################
    normal = {
        initialize <- function(X, Fy) {
            p <- head(dim(X), -1)
            r <- length(p)

            # Mode-wise covariances
            XSigmas <- mcov(X, sample.axis = r + 1L)
            YSigmas <- mcov(Fy, sample.axis = r + 1L)

            # Extract the main covariance directions per mode
            # Note: the directions are transposed!
            XDirs <- Map(function(Sigma) {
                SVD <- La.svd(Sigma, nu = 0)
                sqrt(SVD$d) * SVD$vt
            }, XSigmas)
            YDirs <- Map(function(Sigma) {
                SVD <- La.svd(Sigma, nu = 0)
                sqrt(SVD$d) * SVD$vt
            }, YSigmas)

            alphas <- Map(function(xdir, ydir) {
                s <- min(ncol(xdir), nrow(ydir))
                crossprod(xdir[seq_len(s), , drop = FALSE],
                          ydir[seq_len(s), , drop = FALSE])
            }, XDirs, YDirs)

            list(
                eta1 = rowMeans(X, dims = r),
                alphas = alphas,
                Omegas = Map(diag, p)
            )
        }

        # parameters of the tensor normal computed from the GLM parameters
        params <- function(Fy, eta1, alphas, Omegas) {
            Deltas <- Map(solve, Omegas)
            mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
            list(mu_y = mu_y, Deltas = Deltas)
        }

        # scaled negative log-likelihood
        log.likelihood <- function(X, Fy, eta1, alphas, Omegas) {
            n <- tail(dim(X), 1)     # sample size
            p <- head(dim(X), -1)    # predictor dimensions

            # conditional mean
            mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas))

            # negative log-likelihood scaled by sample size
            # Note: `suppressWarnings` is needed because `log(mapply(det, Omegas))`
            # might fail, but `NAGD` has failsafes against cases of "illegal"
            # parameters.
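            # The last term below uses the Kronecker determinant identity
            #   det(Omega_r %x% ... %x% Omega_1) = prod_j det(Omega_j)^(prod(p) / p_j),
            # hence log det = prod(p) * sum_j log(det(Omega_j)) / p_j.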
            suppressWarnings(
                0.5 * prod(p) * log(2 * pi) +
                sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) -
                (0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p)
            )
        }

        # gradient of the scaled negative log-likelihood
        grad <- function(X, Fy, eta1, alphas, Omegas) {
            # retrieve dimensions
            n <- tail(dim(X), 1)     # sample size
            p <- head(dim(X), -1)    # predictor dimensions
            r <- length(p)           # single predictor/response tensor order

            ## "Inverse" Link: Tensor Normal Specific
            # known exponential family constants
            c1 <- 1
            c2 <- -0.5

            # Covariances from the GLM parameter scatter matrices
            Deltas <- Map(solve, Omegas)

            # First moment via the "inverse" link `g1(eta_y) = E[X | Y = y]`
            E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)

            # Second moment via the "inverse" link
            # `g2(eta_y) = E[vec(X) vec(X)' | Y = y]`
            dim(E1) <- c(prod(p), n)
            E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1))
            ## end "Inverse" Link

            dim(X) <- c(prod(p), n)

            # Residuals
            R <- X - E1
            dim(R) <- c(p, n)

            # mean deviation between the sample covariance and the GLM estimated
            # covariance `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
            S <- rowMeans(colKronecker(X, X)) - E2  # <- optimized for the tensor normal
            dim(S) <- c(p, p)   # reshape to a tensor of order `2 r`

            # Gradients of the negative log-likelihood scaled by sample size
            list(
                "Dl(eta1)" = -c1 * rowMeans(R, dims = r),
                "Dl(alphas)" = Map(function(j) {
                    (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
                }, 1:r),
                "Dl(Omegas)" = Map(function(j) {
                    deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
                    # adapt to the symmetry constraint for the derivative
                    dim(deriv) <- c(p[j], p[j])
                    deriv + t(deriv * (1 - diag(p[j])))
                }, 1:r)
            )
        }

        # mean conditional Fisher Information
        fisher.info <- function(Fy, eta1, alphas, Omegas) {
            # retrieve dimensions
            n <- tail(dim(Fy), 1)    # sample size
            p <- dim(eta1)           # predictor dimensions
            q <- head(dim(Fy), -1)   # response dimensions
            r <- length(p)           # single predictor/response tensor order

            # Hij = Cov(ti(X) %x% tj(X) | Y = y), i, j = 1, 2
            H11 <- Reduce(`%x%`, rev(Map(solve, Omegas)))   # covariance
            # 3rd central moment is zero
            H12 <- H21 <- 0
            # 4th moment by "Isserlis' theorem" a.k.a. "Wick's theorem":
            # sum over the three distinct pair partitions of the four indices
            H22 <- kronecker(H11, H11)
            dim(H22) <- rep(prod(p), 4)
            H22 <- H22 + aperm(H22, c(1, 3, 2, 4)) + aperm(H22, c(1, 2, 4, 3))
            dim(H11) <- c(p, p)
            dim(H22) <- c(p, p, p, p)

            ## Fisher Information: Tensor Normal Specific
            # known exponential family constants
            c1 <- 1
            c2 <- -0.5

            # list of (Fy x_{k in [r] \ j} alpha_k)
            Gys <- Map(function(j) {
                mlm(Fy, alphas[-j], (1:r)[-j])
            }, 1:r)

            # setup the indices of lower triangular matrix elements used for
            # all tuples (j, l) with 1 <= j <= l <= r.
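            # e.g. for r = 3: J = c(1, 1, 1, 2, 2, 3), L = c(1, 2, 3, 2, 3, 3),
            # i.e. the tuples (1,1), (1,2), (1,3), (2,2), (2,3), (3,3)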
            ltri <- lower.tri(diag(r), TRUE)
            J <- .col(c(r, r))[ltri]
            L <- .row(c(r, r))[ltri]

            # inverse perfect outer shuffle of `2 r` elements
            iShuf <- rep(1:r, each = 2) + c(0, r)

            # Getter for the i'th observation from Gys[[j]]
            # TODO: this is ugly, find a better (but still dynamic) version
            GyGet <- function(i, j) {
                obs <- eval(str2lang(paste(
                    "Gys[[j]][", paste(rep(",", r), collapse = ""), "i, drop = FALSE]"
                , collapse = "")))
                dim(obs) <- head(dim(obs), -1)
                obs
            }

            I11 <- c1^2 * mat(H11, 1:r)

            I12 <- Map(function(j) {
                i11 <- ttt(Gys[[j]], H11, (1:r)[-j])
                # take the mean with respect to the observations (second mode)
                i11 <- colMeans(mat(i11, 2))
                dim(i11) <- c(q[j], p[j], p)
                t(mat(i11, 2:1))
            }, 1:r)

            I13 <- Map(matrix, 0, prod(p), p^2)

            I22 <- Map(function(j, l) {
                i22 <- 0
                for (i in 1:n) {
                    i22 <- i22 + ttt(
                        ttt(GyGet(i, j), H11, (1:r)[-j]),
                        GyGet(i, l), (1:r)[-l] + 2, (1:r)[-l])
                }
                (c1^2 / n) * mat(i22, 2:1)
            }, J, L)

            I23 <- Map(matrix, 0, (p * q)[J], (p^2)[L])

            # shuffled H22
            sH22 <- c2^2 * aperm(H22, c(iShuf, iShuf + 2 * r))
            dim(sH22) <- c(p^2, p^2)

            I33 <- Map(function(j, l) {
                # second derivative (without constraints)
                deriv <- mlm(
                    mlm(sH22, Map(c, Omegas[-j]), (1:r)[-j], transposed = TRUE),
                    Map(c, Omegas[-l]), (1:r)[-l] + r, transposed = TRUE)
                dim(deriv) <- c(p[j], p[j], p[l], p[l])
                # enforce the symmetry of `Omega_j`
                of.diag <- (slice.index(deriv, 1:2) != slice.index(deriv, 2:1))
                deriv <- deriv + of.diag * aperm(deriv, c(2, 1, 3, 4))
                # as well as the symmetry of `Omega_l`
                of.diag <- (slice.index(deriv, 3:4) != slice.index(deriv, 4:3))
                deriv <- deriv + of.diag * aperm(deriv, c(1, 2, 4, 3))
                # matricize and return
                dim(deriv) <- c(p[j]^2, p[l]^2)
                deriv
            }, J, L)

            names(I12) <- sprintf("I(eta1, alpha_%d)", 1:r)
            names(I13) <- sprintf("I(eta1, Omega_%d)", 1:r)
            names(I22) <- sprintf("I(alpha_%d, alpha_%d)", J, L)
            names(I23) <- sprintf("I(alpha_%d, Omega_%d)", J, L)
            names(I33) <- sprintf("I(Omega_%d, Omega_%d)", J, L)

            list(I11 = I11, I12 = I12, I13 = I13,
                 I22 = I22, I23 = I23, I33 = I33)
        }

        # Hessian of the scaled negative log-likelihood
        hessian <- function(X, Fy, eta1, alphas, Omegas) {
            stop("Not Implemented")
        }
    },
    ########################################################################
    ###                      Multi-Variate Bernoulli                     ###
    ########################################################################
    bernoulli = {
        require(mvbernoulli)

        initialize <- function(X, Fy) {
            p <- head(dim(X), -1)
            r <- length(p)

            # Mode-wise covariances
            XSigmas <- mcov(X, sample.axis = r + 1L)
            YSigmas <- mcov(Fy, sample.axis = r + 1L)

            # Extract the main covariance directions per mode
            # Note: the directions are transposed!
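            # (as in the tensor normal case: each row of the direction matrices
            # below is a principal direction scaled by the square root of its
            # variance; `alphas` is a heuristic start value matching the leading
            # directions of X and Fy)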
            XDirs <- Map(function(Sigma) {
                SVD <- La.svd(Sigma, nu = 0)
                sqrt(SVD$d) * SVD$vt
            }, XSigmas)
            YDirs <- Map(function(Sigma) {
                SVD <- La.svd(Sigma, nu = 0)
                sqrt(SVD$d) * SVD$vt
            }, YSigmas)

            alphas <- Map(function(xdir, ydir) {
                s <- min(ncol(xdir), nrow(ydir))
                crossprod(xdir[seq_len(s), , drop = FALSE],
                          ydir[seq_len(s), , drop = FALSE])
            }, XDirs, YDirs)

            list(
                eta1 = array(0, dim = p),
                alphas = alphas,
                Omegas = Map(diag, p)
            )
        }

        params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
            # number of observations
            n <- tail(dim(Fy), 1)

            # natural exponential family parameters
            eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1))
            eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas))

            # # next the conditional Ising model parameters `theta_y`
            # theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n)
            # dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n)
            # ltri <- which(lower.tri(eta_y2, diag = TRUE))
            # diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
            # theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1)
            # theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ]

            # conditional Ising model parameters
            theta_y <- matrix(rep(vech(eta_y2), n), ncol = n)

            ltri <- which(lower.tri(eta_y2, diag = TRUE))
            diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
            theta_y[diagonal, ] <- eta_y1

            theta_y
        }

        # scaled negative log-likelihood
        log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
            # number of observations
            n <- tail(dim(X), 1L)

            # conditional Ising model parameters
            theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)

            # convert to a binary data set
            storage.mode(X) <- "integer"
            X.mvb <- as.mvbinary(mat(X, length(dim(X))))

            # log-likelihood of the data set
            -mean(sapply(seq_len(n), function(i) {
                ising_log_likelihood(theta_y[, i], X.mvb[i])
            }))
        }

        # Gradient of the scaled negative log-likelihood
        grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
            # retrieve dimensions
            n <- tail(dim(X), 1)     # sample size
            p <- head(dim(X), -1)    # predictor dimensions
            r <- length(p)           # single predictor/response tensor order

            ## "Inverse" Link: Ising Model Specific
            # conditional Ising model parameters: `p (p + 1) / 2` by `n`
            theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)

            # conditional expectations
            # ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y]
            E2 <- apply(theta_y, 2L, ising_marginal_probs)
            # convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y]
            E2 <- E2[vech.pinv.index(prod(p)), ]
            # extract the diagonal elements, which are equal to E[vec(X) | Y = y]
            E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ]
            ## end "Inverse" Link

            dim(X) <- c(prod(p), n)

            # Residuals
            R <- X - E1
            dim(R) <- c(p, n)

            # mean deviation between the sample covariance and the GLM estimated
            # covariance `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
            S <- rowMeans(colKronecker(X, X) - E2)
            dim(S) <- c(p, p)   # reshape to a tensor of order `2 r`

            # Gradients of the negative log-likelihood scaled by sample size
            list(
                "Dl(eta1)" = -c1 * rowMeans(R, dims = r),
                "Dl(alphas)" = Map(function(j) {
                    (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
                }, 1:r),
                "Dl(Omegas)" = Map(function(j) {
                    deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
                    # adapt to the symmetry constraint for the derivative
                    dim(deriv) <- c(p[j], p[j])
                    deriv + t(deriv * (1 - diag(p[j])))
                }, 1:r)
            )
        }

        # Hessian of the scaled negative log-likelihood
        hessian <- function(X, Fy, alphas, Omegas, c1 = 1, c2 = 1) {
            stop("Not Implemented")
        }

        # Conditional Fisher Information
        fisher.info <- function(Fy, alphas, Omegas) {
            stop("Not Implemented")
        }
    })

    list(
        family = name,
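        # family-specific functions constructed by the `switch` above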
        initialize = initialize,
        params = params,
        # linkinv = linkinv,
        log.likelihood = log.likelihood,
        grad = grad,
        hessian = hessian,
        fisher.info = fisher.info
    )
}
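
## Usage sketch (assumptions: the tensor helpers used above -- `mcov`, `mlm`,
## `mat`, `mcrossprod`, `colKronecker`, `mtvk`, `ttt`, `vech`,
## `vech.pinv.index` -- are provided elsewhere in this package, and the sample
## axis is the last axis of both `X` and `Fy`):
#
#   family <- make.gmlm.family("normal")
#
#   # 10 x 5 predictors and 2 x 3 responses for n = 100 observations
#   X  <- array(rnorm(10 * 5 * 100), dim = c(10, 5, 100))
#   Fy <- array(rnorm( 2 * 3 * 100), dim = c( 2, 3, 100))
#
#   # heuristic starting values, and the objective/gradient evaluated at them
#   theta <- family$initialize(X, Fy)
#   family$log.likelihood(X, Fy, theta$eta1, theta$alphas, theta$Omegas)
#   grads <- family$grad(X, Fy, theta$eta1, theta$alphas, theta$Omegas)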