#' Fitting Generalized Multi-Linear Models #' #' @export GMLM <- function(...) { stop("Not Implemented") } make.gmlm.family <- function(name) { # standardize family name name <- list( normal = "normal", gaussian = "normal", bernoulli = "bernoulli", ising = "bernoulli" )[[tolower(name), exact = FALSE]] ############################################################################ # # # TODO: better (and possibly specialized) initial parameters!?!?!?! # # # ############################################################################ switch(name, normal = { initialize <- function(X, Fy) { p <- head(dim(X), -1) r <- length(p) # Mode-Covariances XSigmas <- mcov(X, sample.axis = r + 1L) YSigmas <- mcov(Fy, sample.axis = r + 1L) # Extract main mode covariance directions # Note: (the directions are transposed!) XDirs <- Map(function(Sigma) { with(La.svd(Sigma, nu = 0), sqrt(d) * vt) }, XSigmas) YDirs <- Map(function(Sigma) { with(La.svd(Sigma, nu = 0), sqrt(d) * vt) }, YSigmas) alphas <- Map(function(xdir, ydir) { s <- min(ncol(xdir), nrow(ydir)) crossprod(xdir[seq_len(s), , drop = FALSE], ydir[seq_len(s), , drop = FALSE]) }, XDirs, YDirs) list( eta1 = rowMeans(X, dims = r), alphas = alphas, Omegas = Map(diag, p) ) } # parameters of the tensor normal computed from the GLM parameters params <- function(Fy, eta1, alphas, Omegas) { Deltas <- Map(solve, Omegas) mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas) list(mu_y = mu_y, Deltas = Deltas) } # scaled negative log-likelihood log.likelihood <- function(X, Fy, eta1, alphas, Omegas) { n <- tail(dim(X), 1) # sample size # conditional mean mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas)) # negative log-likelihood scaled by sample size # Note: the `suppressWarnings` is cause `log(mapply(det, Omegas)` # migth fail, but `NAGD` has failsaves againt cases of "illegal" # parameters. suppressWarnings( 0.5 * prod(p) * log(2 * pi) + sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) - (0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p) ) } # gradient of the scaled negative log-likelihood grad <- function(X, Fy, eta1, alphas, Omegas) { # retrieve dimensions n <- tail(dim(X), 1) # sample size p <- head(dim(X), -1) # predictor dimensions q <- head(dim(Fy), -1) # response dimensions r <- length(p) # single predictor/response tensor order ## "Inverse" Link: Tensor Normal Specific # known exponential family constants c1 <- 1 c2 <- -0.5 # Covariances from the GLM parameter Scatter matrices Deltas <- Map(solve, Omegas) # First moment via "inverse" link `g1(eta_y) = E[X | Y = y]` E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas) # Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]` dim(E1) <- c(prod(p), n) E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1)) ## end "Inverse" Link dim(X) <- c(prod(p), n) # Residuals R <- X - E1 dim(R) <- c(p, n) # mean deviation between the sample covariance to GLM estimated covariance # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))` S <- rowMeans(colKronecker(X, X)) - E2 # <- Optimized for Tensor Normal dim(S) <- c(p, p) # reshape to tensor or order `2 r` # Gradients of the negative log-likelihood scaled by sample size list( "Dl(eta1)" = -c1 * rowMeans(R, dims = r), "Dl(alphas)" = Map(function(j) { (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j) }, 1:r), "Dl(Omegas)" = Map(function(j) { deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j])) # addapt to symmetric constraint for the derivative dim(deriv) <- c(p[j], p[j]) deriv + t(deriv * (1 - diag(p[j]))) }, 1:r) ) } }, bernoulli = { require(mvbernoulli) initialize <- function(X, Fy) { p <- head(dim(X), -1) r <- length(p) # Mode-Covariances XSigmas <- mcov(X, sample.axis = r + 1L) YSigmas <- mcov(Fy, sample.axis = r + 1L) # Extract main mode covariance directions # Note: (the directions are transposed!) XDirs <- Map(function(Sigma) { with(La.svd(Sigma, nu = 0), sqrt(d) * vt) }, XSigmas) YDirs <- Map(function(Sigma) { with(La.svd(Sigma, nu = 0), sqrt(d) * vt) }, YSigmas) alphas <- Map(function(xdir, ydir) { s <- min(ncol(xdir), nrow(ydir)) crossprod(xdir[seq_len(s), , drop = FALSE], ydir[seq_len(s), , drop = FALSE]) }, XDirs, YDirs) list( eta1 = array(0, dim = p), alphas = alphas, Omegas = Map(diag, p) ) } # initialize <- function(X, Fy) { # r <- length(dim(X)) - 1L # # Mode-Covariances # XSigmas <- mcov(X, sample.axis = r + 1L) # YSigmas <- mcov(Fy, sample.axis = r + 1L) # # Extract main mode covariance directions # # Note: (the directions are transposed!) # XDirs <- Map(function(Sigma) { # with(La.svd(Sigma, nu = 0), sqrt(d) * vt) # }, XSigmas) # YDirs <- Map(function(Sigma) { # with(La.svd(Sigma, nu = 0), sqrt(d) * vt) # }, YSigmas) # alphas <- Map(function(xdir, ydir) { # s <- min(ncol(xdir), nrow(ydir)) # crossprod(xdir[seq_len(s), , drop = FALSE], # ydir[seq_len(s), , drop = FALSE]) # }, XDirs, YDirs) # # Scatter matrices from Residuals (intercept not considered) # Deltas <- mcov(X - mlm(Fy, alphas), sample.axis = r + 1L) # Omegas <- Map(solve, Deltas) # # and the intercept # eta1 <- mlm(rowMeans(X, dims = r), Deltas) # list( # eta1 = eta1, # alphas = alphas, # Omegas = Omegas # ) # } params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) { # number of observations n <- tail(dim(Fy), 1) # natural exponential family parameters eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1)) eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas)) # # next the conditional Ising model parameters `theta_y` # theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n) # dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n) # ltri <- which(lower.tri(eta_y2, diag = TRUE)) # diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri]) # theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1) # theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ] # conditional Ising model parameters theta_y <- matrix(rep(vech(eta_y2), n), ncol = n) ltri <- which(lower.tri(eta_y2, diag = TRUE)) diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri]) theta_y[diagonal, ] <- eta_y1 theta_y } # Scaled ngative log-likelihood log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) { # number of observations n <- tail(dim(X), 1L) # conditional Ising model parameters theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2) # convert to binary data set storage.mode(X) <- "integer" X.mvb <- as.mvbinary(mat(X, length(dim(X)))) # log-likelihood of the data set -mean(sapply(seq_len(n), function(i) { ising_log_likelihood(theta_y[, i], X.mvb[i]) })) } # Gradient of the scaled negative log-likelihood grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) { # retrieve dimensions n <- tail(dim(X), 1) # sample size p <- head(dim(X), -1) # predictor dimensions q <- head(dim(Fy), -1) # response dimensions r <- length(p) # single predictor/response tensor order ## "Inverse" Link: Ising Model Specific # conditional Ising model parameters: `p (p + 1) / 2` by `n` theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2) # conditional expectations # ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y] E2 <- apply(theta_y, 2L, ising_marginal_probs) # convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y] E2 <- E2[vech.pinv.index(prod(p)), ] # extract diagonal elements which are equal to E[vec(X) | Y = y] E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ] ## end "Inverse" Link dim(X) <- c(prod(p), n) # Residuals R <- X - E1 dim(R) <- c(p, n) # mean deviation between the sample covariance to GLM estimated covariance # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))` S <- rowMeans(colKronecker(X, X) - E2) dim(S) <- c(p, p) # reshape to tensor or order `2 r` # Gradients of the negative log-likelihood scaled by sample size list( "Dl(eta1)" = -c1 * rowMeans(R, dims = r), "Dl(alphas)" = Map(function(j) { (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j) }, 1:r), "Dl(Omegas)" = Map(function(j) { deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j])) # addapt to symmetric constraint for the derivative dim(deriv) <- c(p[j], p[j]) deriv + t(deriv * (1 - diag(p[j]))) }, 1:r) ) } } ) list( family = name, initialize = initialize, params = params, # linkinv = linkinv, log.likelihood = log.likelihood, grad = grad ) } #' @export GMLM.default <- function(X, Fy, sample.axis = 1L, family = "normal", ..., eps = sqrt(.Machine$double.eps), logger = NULL ) { stopifnot(exprs = { length(sample.axis) == 1L (1L <= sample.axis) && (sample.axis <= length(dim(X))) (dim(X) == dim(Fy))[sample.axis] }) # rearrange `X`, `Fy` such that the last axis enumerates observations axis.perm <- c(seq_along(dim(X))[-sample.axis], sample.axis) X <- aperm(X, axis.perm) Fy <- aperm(Fy, axis.perm) # setup family specific GLM (pseudo) "inverse" link family <- make.gmlm.family(family) # wrap logger in callback for NAGD callback <- if (is.function(logger)) { function(iter, params) { do.call(logger, c(list(iter), params)) } } params.fit <- NAGD( fun.loss = function(params) { # scaled negative lig-likelihood # eta1 alphas Omegas family$log.likelihood(X, Fy, params[[1]], params[[2]], params[[3]]) }, fun.grad = function(params) { # gradient of the scaled negative lig-likelihood # eta1 alphas Omegas family$grad(X, Fy, params[[1]], params[[2]], params[[3]]) }, params = family$initialize(X, Fy), # initialen parameter estimates fun.lincomb = function(a, lhs, b, rhs) { list( a * lhs[[1]] + b * rhs[[1]], Map(function(l, r) a * l + b * r, lhs[[2]], rhs[[2]]), Map(function(l, r) a * l + b * r, lhs[[3]], rhs[[3]]) ) }, fun.norm2 = function(params) { sum(unlist(params)^2) }, callback = callback, ... ) structure(params.fit, names = c("eta1", "alphas", "Omegas")) }