357 lines
14 KiB
R
357 lines
14 KiB
R
#' Fitting Generalized Multi-Linear Models
|
|
#'
|
|
#' @export
|
|
GMLM <- function(...) {
|
|
stop("Not Implemented")
|
|
}
|
|
|
|
|
|
make.gmlm.family <- function(name) {
|
|
# standardize family name
|
|
name <- list(
|
|
normal = "normal", gaussian = "normal",
|
|
bernoulli = "bernoulli", ising = "bernoulli"
|
|
)[[tolower(name), exact = FALSE]]
|
|
|
|
############################################################################
|
|
# #
|
|
# TODO: better (and possibly specialized) initial parameters!?!?!?! #
|
|
# #
|
|
############################################################################
|
|
|
|
switch(name,
|
|
normal = {
|
|
initialize <- function(X, Fy) {
|
|
p <- head(dim(X), -1)
|
|
r <- length(p)
|
|
|
|
# Mode-Covariances
|
|
XSigmas <- mcov(X, sample.axis = r + 1L)
|
|
YSigmas <- mcov(Fy, sample.axis = r + 1L)
|
|
|
|
# Extract main mode covariance directions
|
|
# Note: (the directions are transposed!)
|
|
XDirs <- Map(function(Sigma) {
|
|
with(La.svd(Sigma, nu = 0), sqrt(d) * vt)
|
|
}, XSigmas)
|
|
YDirs <- Map(function(Sigma) {
|
|
with(La.svd(Sigma, nu = 0), sqrt(d) * vt)
|
|
}, YSigmas)
|
|
|
|
alphas <- Map(function(xdir, ydir) {
|
|
s <- min(ncol(xdir), nrow(ydir))
|
|
crossprod(xdir[seq_len(s), , drop = FALSE],
|
|
ydir[seq_len(s), , drop = FALSE])
|
|
}, XDirs, YDirs)
|
|
|
|
list(
|
|
eta1 = rowMeans(X, dims = r),
|
|
alphas = alphas,
|
|
Omegas = Map(diag, p)
|
|
)
|
|
}
|
|
|
|
# parameters of the tensor normal computed from the GLM parameters
|
|
params <- function(Fy, eta1, alphas, Omegas) {
|
|
Deltas <- Map(solve, Omegas)
|
|
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
|
|
list(mu_y = mu_y, Deltas = Deltas)
|
|
}
|
|
|
|
# scaled negative log-likelihood
|
|
log.likelihood <- function(X, Fy, eta1, alphas, Omegas) {
|
|
n <- tail(dim(X), 1) # sample size
|
|
|
|
# conditional mean
|
|
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas))
|
|
|
|
# negative log-likelihood scaled by sample size
|
|
# Note: the `suppressWarnings` is cause `log(mapply(det, Omegas)`
|
|
# migth fail, but `NAGD` has failsaves againt cases of "illegal"
|
|
# parameters.
|
|
suppressWarnings(
|
|
0.5 * prod(p) * log(2 * pi) +
|
|
sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) -
|
|
(0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p)
|
|
)
|
|
}
|
|
# gradient of the scaled negative log-likelihood
|
|
grad <- function(X, Fy, eta1, alphas, Omegas) {
|
|
# retrieve dimensions
|
|
n <- tail(dim(X), 1) # sample size
|
|
p <- head(dim(X), -1) # predictor dimensions
|
|
q <- head(dim(Fy), -1) # response dimensions
|
|
r <- length(p) # single predictor/response tensor order
|
|
|
|
## "Inverse" Link: Tensor Normal Specific
|
|
# known exponential family constants
|
|
c1 <- 1
|
|
c2 <- -0.5
|
|
|
|
# Covariances from the GLM parameter Scatter matrices
|
|
Deltas <- Map(solve, Omegas)
|
|
|
|
# First moment via "inverse" link `g1(eta_y) = E[X | Y = y]`
|
|
E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
|
|
|
|
# Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]`
|
|
dim(E1) <- c(prod(p), n)
|
|
E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1))
|
|
## end "Inverse" Link
|
|
|
|
dim(X) <- c(prod(p), n)
|
|
|
|
# Residuals
|
|
R <- X - E1
|
|
dim(R) <- c(p, n)
|
|
|
|
# mean deviation between the sample covariance to GLM estimated covariance
|
|
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
|
|
S <- rowMeans(colKronecker(X, X)) - E2 # <- Optimized for Tensor Normal
|
|
dim(S) <- c(p, p) # reshape to tensor or order `2 r`
|
|
|
|
# Gradients of the negative log-likelihood scaled by sample size
|
|
list(
|
|
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
|
|
"Dl(alphas)" = Map(function(j) {
|
|
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
|
|
}, 1:r),
|
|
"Dl(Omegas)" = Map(function(j) {
|
|
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
|
|
# addapt to symmetric constraint for the derivative
|
|
dim(deriv) <- c(p[j], p[j])
|
|
deriv + t(deriv * (1 - diag(p[j])))
|
|
}, 1:r)
|
|
)
|
|
}
|
|
},
|
|
bernoulli = {
|
|
require(mvbernoulli)
|
|
|
|
initialize <- function(X, Fy) {
|
|
p <- head(dim(X), -1)
|
|
r <- length(p)
|
|
|
|
# Mode-Covariances
|
|
XSigmas <- mcov(X, sample.axis = r + 1L)
|
|
YSigmas <- mcov(Fy, sample.axis = r + 1L)
|
|
|
|
# Extract main mode covariance directions
|
|
# Note: (the directions are transposed!)
|
|
XDirs <- Map(function(Sigma) {
|
|
with(La.svd(Sigma, nu = 0), sqrt(d) * vt)
|
|
}, XSigmas)
|
|
YDirs <- Map(function(Sigma) {
|
|
with(La.svd(Sigma, nu = 0), sqrt(d) * vt)
|
|
}, YSigmas)
|
|
|
|
alphas <- Map(function(xdir, ydir) {
|
|
s <- min(ncol(xdir), nrow(ydir))
|
|
crossprod(xdir[seq_len(s), , drop = FALSE],
|
|
ydir[seq_len(s), , drop = FALSE])
|
|
}, XDirs, YDirs)
|
|
|
|
list(
|
|
eta1 = array(0, dim = p),
|
|
alphas = alphas,
|
|
Omegas = Map(diag, p)
|
|
)
|
|
}
|
|
|
|
# initialize <- function(X, Fy) {
|
|
# r <- length(dim(X)) - 1L
|
|
|
|
# # Mode-Covariances
|
|
# XSigmas <- mcov(X, sample.axis = r + 1L)
|
|
# YSigmas <- mcov(Fy, sample.axis = r + 1L)
|
|
|
|
# # Extract main mode covariance directions
|
|
# # Note: (the directions are transposed!)
|
|
# XDirs <- Map(function(Sigma) {
|
|
# with(La.svd(Sigma, nu = 0), sqrt(d) * vt)
|
|
# }, XSigmas)
|
|
# YDirs <- Map(function(Sigma) {
|
|
# with(La.svd(Sigma, nu = 0), sqrt(d) * vt)
|
|
# }, YSigmas)
|
|
|
|
# alphas <- Map(function(xdir, ydir) {
|
|
# s <- min(ncol(xdir), nrow(ydir))
|
|
# crossprod(xdir[seq_len(s), , drop = FALSE],
|
|
# ydir[seq_len(s), , drop = FALSE])
|
|
# }, XDirs, YDirs)
|
|
|
|
# # Scatter matrices from Residuals (intercept not considered)
|
|
# Deltas <- mcov(X - mlm(Fy, alphas), sample.axis = r + 1L)
|
|
# Omegas <- Map(solve, Deltas)
|
|
|
|
# # and the intercept
|
|
# eta1 <- mlm(rowMeans(X, dims = r), Deltas)
|
|
|
|
# list(
|
|
# eta1 = eta1,
|
|
# alphas = alphas,
|
|
# Omegas = Omegas
|
|
# )
|
|
# }
|
|
|
|
params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
|
|
# number of observations
|
|
n <- tail(dim(Fy), 1)
|
|
|
|
# natural exponential family parameters
|
|
eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1))
|
|
eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas))
|
|
|
|
# # next the conditional Ising model parameters `theta_y`
|
|
# theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n)
|
|
# dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n)
|
|
|
|
# ltri <- which(lower.tri(eta_y2, diag = TRUE))
|
|
# diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
|
|
# theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1)
|
|
# theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ]
|
|
|
|
# conditional Ising model parameters
|
|
theta_y <- matrix(rep(vech(eta_y2), n), ncol = n)
|
|
ltri <- which(lower.tri(eta_y2, diag = TRUE))
|
|
diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
|
|
theta_y[diagonal, ] <- eta_y1
|
|
|
|
theta_y
|
|
}
|
|
|
|
# Scaled ngative log-likelihood
|
|
log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
|
|
# number of observations
|
|
n <- tail(dim(X), 1L)
|
|
|
|
# conditional Ising model parameters
|
|
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
|
|
|
|
# convert to binary data set
|
|
storage.mode(X) <- "integer"
|
|
X.mvb <- as.mvbinary(mat(X, length(dim(X))))
|
|
|
|
# log-likelihood of the data set
|
|
-mean(sapply(seq_len(n), function(i) {
|
|
ising_log_likelihood(theta_y[, i], X.mvb[i])
|
|
}))
|
|
}
|
|
|
|
# Gradient of the scaled negative log-likelihood
|
|
grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
|
|
# retrieve dimensions
|
|
n <- tail(dim(X), 1) # sample size
|
|
p <- head(dim(X), -1) # predictor dimensions
|
|
q <- head(dim(Fy), -1) # response dimensions
|
|
r <- length(p) # single predictor/response tensor order
|
|
|
|
## "Inverse" Link: Ising Model Specific
|
|
# conditional Ising model parameters: `p (p + 1) / 2` by `n`
|
|
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
|
|
|
|
# conditional expectations
|
|
# ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y]
|
|
E2 <- apply(theta_y, 2L, ising_marginal_probs)
|
|
# convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y]
|
|
E2 <- E2[vech.pinv.index(prod(p)), ]
|
|
# extract diagonal elements which are equal to E[vec(X) | Y = y]
|
|
E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ]
|
|
## end "Inverse" Link
|
|
|
|
dim(X) <- c(prod(p), n)
|
|
|
|
# Residuals
|
|
R <- X - E1
|
|
dim(R) <- c(p, n)
|
|
|
|
# mean deviation between the sample covariance to GLM estimated covariance
|
|
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
|
|
S <- rowMeans(colKronecker(X, X) - E2)
|
|
dim(S) <- c(p, p) # reshape to tensor or order `2 r`
|
|
|
|
# Gradients of the negative log-likelihood scaled by sample size
|
|
list(
|
|
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
|
|
"Dl(alphas)" = Map(function(j) {
|
|
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
|
|
}, 1:r),
|
|
"Dl(Omegas)" = Map(function(j) {
|
|
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
|
|
# addapt to symmetric constraint for the derivative
|
|
dim(deriv) <- c(p[j], p[j])
|
|
deriv + t(deriv * (1 - diag(p[j])))
|
|
}, 1:r)
|
|
)
|
|
}
|
|
}
|
|
)
|
|
|
|
list(
|
|
family = name,
|
|
initialize = initialize,
|
|
params = params,
|
|
# linkinv = linkinv,
|
|
log.likelihood = log.likelihood,
|
|
grad = grad
|
|
)
|
|
}
|
|
|
|
|
|
#' @export
|
|
GMLM.default <- function(X, Fy, sample.axis = 1L,
|
|
family = "normal",
|
|
...,
|
|
eps = sqrt(.Machine$double.eps),
|
|
logger = NULL
|
|
) {
|
|
stopifnot(exprs = {
|
|
length(sample.axis) == 1L
|
|
(1L <= sample.axis) && (sample.axis <= length(dim(X)))
|
|
(dim(X) == dim(Fy))[sample.axis]
|
|
})
|
|
|
|
# rearrange `X`, `Fy` such that the last axis enumerates observations
|
|
axis.perm <- c(seq_along(dim(X))[-sample.axis], sample.axis)
|
|
X <- aperm(X, axis.perm)
|
|
Fy <- aperm(Fy, axis.perm)
|
|
|
|
# setup family specific GLM (pseudo) "inverse" link
|
|
family <- make.gmlm.family(family)
|
|
|
|
# wrap logger in callback for NAGD
|
|
callback <- if (is.function(logger)) {
|
|
function(iter, params) {
|
|
do.call(logger, c(list(iter), params))
|
|
}
|
|
}
|
|
|
|
params.fit <- NAGD(
|
|
fun.loss = function(params) {
|
|
# scaled negative lig-likelihood
|
|
# eta1 alphas Omegas
|
|
family$log.likelihood(X, Fy, params[[1]], params[[2]], params[[3]])
|
|
},
|
|
fun.grad = function(params) {
|
|
# gradient of the scaled negative lig-likelihood
|
|
# eta1 alphas Omegas
|
|
family$grad(X, Fy, params[[1]], params[[2]], params[[3]])
|
|
},
|
|
params = family$initialize(X, Fy), # initialen parameter estimates
|
|
fun.lincomb = function(a, lhs, b, rhs) {
|
|
list(
|
|
a * lhs[[1]] + b * rhs[[1]],
|
|
Map(function(l, r) a * l + b * r, lhs[[2]], rhs[[2]]),
|
|
Map(function(l, r) a * l + b * r, lhs[[3]], rhs[[3]])
|
|
)
|
|
},
|
|
fun.norm2 = function(params) {
|
|
sum(unlist(params)^2)
|
|
},
|
|
callback = callback,
|
|
...
|
|
)
|
|
|
|
structure(params.fit, names = c("eta1", "alphas", "Omegas"))
|
|
}
|