# tensor_predictors/tensorPredictors/R/make_gmlm_family.R

make.gmlm.family <- function(name) {
    # standardize family name
    name <- list(
        normal = "normal", gaussian = "normal",
        bernoulli = "bernoulli", ising = "bernoulli"
    )[[tolower(name), exact = FALSE]]
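    # Illustrative note: `tolower` plus partial matching (`exact = FALSE`)
    # means any unique prefix of a listed name resolves, e.g.
    #     list(normal = "normal", gaussian = "normal")[["gauss", exact = FALSE]]
    # evaluates to `"normal"`, so `"Gaussian"`, `"gauss"` and `"Ising"` all work.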
    ############################################################################
    #                                                                          #
    #   TODO: better (and possibly specialized) initial parameters!?!?!?!      #
    #                                                                          #
    ############################################################################

    switch(name,
        ########################################################################
        ###                          Tensor Normal                           ###
        ########################################################################
        normal = {
            initialize <- function(X, Fy) {
                p <- head(dim(X), -1)
                r <- length(p)

                # Mode-Covariances
                XSigmas <- mcov(X, sample.axis = r + 1L)
                YSigmas <- mcov(Fy, sample.axis = r + 1L)

                # Extract main mode covariance directions
                # Note: the directions are transposed!
                XDirs <- Map(function(Sigma) {
                    SVD <- La.svd(Sigma, nu = 0)
                    sqrt(SVD$d) * SVD$vt
                }, XSigmas)
                YDirs <- Map(function(Sigma) {
                    SVD <- La.svd(Sigma, nu = 0)
                    sqrt(SVD$d) * SVD$vt
                }, YSigmas)

                alphas <- Map(function(xdir, ydir) {
                    s <- min(ncol(xdir), nrow(ydir))
                    crossprod(xdir[seq_len(s), , drop = FALSE],
                              ydir[seq_len(s), , drop = FALSE])
                }, XDirs, YDirs)

                list(
                    eta1 = rowMeans(X, dims = r),
                    alphas = alphas,
                    Omegas = Map(diag, p)
                )
            }
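            # Illustrative shapes (made-up dimensions): for `X` with
            # `dim(X) = c(10, 8, 100)` and `Fy` with `dim(Fy) = c(3, 2, 100)`
            # the list above holds `eta1` (10 x 8), `alphas` as a 10 x 3 and
            # an 8 x 2 matrix, and `Omegas` as 10 x 10 and 8 x 8 identities.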
            # parameters of the tensor normal computed from the GLM parameters
            params <- function(Fy, eta1, alphas, Omegas) {
                Deltas <- Map(solve, Omegas)
                mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
                list(mu_y = mu_y, Deltas = Deltas)
            }
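            # With `Delta_k = solve(Omega_k)` this computes the conditional mean
            #     mu_y = (Fy x_1 alpha_1 ... x_r alpha_r + eta1) x_1 Delta_1 ... x_r Delta_r
            # i.e. `E[X | Y = y]`, alongside the covariance factors `Deltas`.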
            # scaled negative log-likelihood
            log.likelihood <- function(X, Fy, eta1, alphas, Omegas) {
                n <- tail(dim(X), 1)    # sample size
                p <- head(dim(X), -1)   # predictor dimensions

                # conditional mean
                mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas))

                # negative log-likelihood scaled by sample size
                # Note: the `suppressWarnings` is needed because
                # `log(mapply(det, Omegas))` might fail, but `NAGD` has
                # fail-safes against cases of "illegal" parameters.
                suppressWarnings(
                    0.5 * prod(p) * log(2 * pi) +
                        sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) -
                        (0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p)
                )
            }
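            # Written out, the expression above evaluates
            #     l(theta) = (p / 2) log(2 pi)
            #              + (2 n)^-1 sum_i <X_i - mu_yi, (X_i - mu_yi) x_k Omega_k>
            #              - (p / 2) sum_k log(det(Omega_k)) / p_k
            # with `p = prod(head(dim(X), -1))`, using that the determinant of
            # a Kronecker product factorizes as
            #     det(Omega_r %x% ... %x% Omega_1) = prod_k det(Omega_k)^(p / p_k).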
            # gradient of the scaled negative log-likelihood
            grad <- function(X, Fy, eta1, alphas, Omegas) {
                # retrieve dimensions
                n <- tail(dim(X), 1)    # sample size
                p <- head(dim(X), -1)   # predictor dimensions
                r <- length(p)          # single predictor/response tensor order

                ## "Inverse" Link: Tensor Normal Specific
                # known exponential family constants
                c1 <- 1
                c2 <- -0.5

                # Covariances from the GLM parameter scatter matrices
                Deltas <- Map(solve, Omegas)

                # First moment via "inverse" link `g1(eta_y) = E[X | Y = y]`
                E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)

                # Second moment via "inverse" link
                # `g2(eta_y) = E[vec(X) vec(X)' | Y = y]`
                dim(E1) <- c(prod(p), n)
                E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1))
                ## end "Inverse" Link

                dim(X) <- c(prod(p), n)

                # Residuals
                R <- X - E1
                dim(R) <- c(p, n)

                # mean deviation between the sample covariance and the GLM
                # estimated covariance
                # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
                S <- rowMeans(colKronecker(X, X)) - E2  # <- Optimized for Tensor Normal
                dim(S) <- c(p, p)   # reshape to tensor of order `2 r`

                # Gradients of the negative log-likelihood scaled by sample size
                list(
                    "Dl(eta1)" = -c1 * rowMeans(R, dims = r),
                    "Dl(alphas)" = Map(function(j) {
                        (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
                    }, 1:r),
                    "Dl(Omegas)" = Map(function(j) {
                        deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
                        # adapt to the symmetry constraint for the derivative:
                        # off-diagonal elements of a symmetric matrix appear twice
                        dim(deriv) <- c(p[j], p[j])
                        deriv + t(deriv * (1 - diag(p[j])))
                    }, 1:r)
                )
            }
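            # A reading of the returned gradients (not a separately verified
            # derivation):
            #     Dl(eta1)    = -c1 * mean_i R_i
            #     Dl(alpha_j) = -(c1 / n) * sum_i R_i(j) (F_yi x_{k != j} alpha_k)(j)'
            # with `A(j)` the mode-`j` matricization, while `Dl(Omegas)`
            # contracts the moment deviation `S` against the Kronecker product
            # of the remaining `Omega_k` (via `mtvk`) and then symmetrizes.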
            # mean conditional Fisher Information
            fisher.info <- function(Fy, eta1, alphas, Omegas) {
                # retrieve dimensions
                n <- tail(dim(Fy), 1)   # sample size
                p <- dim(eta1)          # predictor dimensions
                q <- head(dim(Fy), -1)  # response dimensions
                r <- length(p)          # single predictor/response tensor order

                # Hij = Cov(ti(X) %x% tj(X) | Y = y), i, j = 1, 2
                H11 <- Reduce(`%x%`, rev(Map(solve, Omegas)))   # covariance
                # 3rd central moment is zero
                H12 <- H21 <- 0
                # 4th moment by "Isserlis' theorem" a.k.a. "Wick's theorem":
                # the two `aperm`s add the remaining two distinct pair
                # partitions to the base pairing `kronecker(H11, H11)`
                H22 <- kronecker(H11, H11)
                dim(H22) <- rep(prod(p), 4)
                H22 <- H22 + aperm(H22, c(1, 3, 2, 4)) + aperm(H22, c(2, 1, 3, 4))
                dim(H11) <- c(p, p)
                dim(H22) <- c(p, p, p, p)

                ## Fisher Information: Tensor Normal Specific
                # known exponential family constants
                c1 <- 1
                c2 <- -0.5

                # list of (Fy x_{k in [r] \ j} alpha_k)
                Gys <- Map(function(j) {
                    mlm(Fy, alphas[-j], (1:r)[-j])
                }, 1:r)

                # setup indices of lower triangular matrix elements used for
                # all tuples (j, l) with 1 <= j <= l <= r.
                ltri <- lower.tri(diag(r), TRUE)
                J <- .col(c(r, r))[ltri]
                L <- .row(c(r, r))[ltri]

                # inverse perfect outer shuffle of `2 r` elements
                iShuf <- rep(1:r, each = 2) + c(0, r)

                # Getter for the i'th observation from Gys[[j]]
                # TODO: this is ugly, find a better (but still dynamic) version
                GyGet <- function(i, j) {
                    obs <- eval(str2lang(paste(
                        "Gys[[j]][",
                        paste(rep(",", r), collapse = ""),
                        "i, drop = FALSE]"
                    , collapse = "")))
                    dim(obs) <- head(dim(obs), -1)
                    obs
                }
                I11 <- c1^2 * mat(H11, 1:r)
                I12 <- Map(function(j) {
                    i11 <- ttt(Gys[[j]], H11, (1:r)[-j])
                    # take mean with respect to observations (second mode)
                    i11 <- colMeans(mat(i11, 2))
                    dim(i11) <- c(q[j], p[j], p)
                    t(mat(i11, 2:1))
                }, 1:r)
                I13 <- Map(matrix, 0, prod(p), p^2)
                I22 <- Map(function(j, l) {
                    i22 <- 0
                    for (i in 1:n) {
                        i22 <- i22 + ttt(
                            ttt(GyGet(i, j), H11, (1:r)[-j]),
                            GyGet(i, l), (1:r)[-l] + 2, (1:r)[-l])
                    }
                    (c1^2 / n) * mat(i22, 2:1)
                }, J, L)
                I23 <- Map(matrix, 0, (p * q)[J], (p^2)[L])
                # shuffled H22
                sH22 <- c2^2 * aperm(H22, c(iShuf, iShuf + 2 * r))
                dim(sH22) <- c(p^2, p^2)
                I33 <- Map(function(j, l) {
                    # second derivative (without constraints)
                    deriv <- mlm(mlm(sH22, Map(c, Omegas[-j]), (1:r)[-j],
                                     transposed = TRUE),
                                 Map(c, Omegas[-l]), (1:r)[-l] + r,
                                 transposed = TRUE)
                    dim(deriv) <- c(p[j], p[j], p[l], p[l])
                    # enforce symmetry of `Omega_j`
                    of.diag <- (slice.index(deriv, 1:2) != slice.index(deriv, 2:1))
                    deriv <- deriv + of.diag * aperm(deriv, c(2, 1, 3, 4))
                    # as well as the symmetry of `Omega_l`
                    of.diag <- (slice.index(deriv, 3:4) != slice.index(deriv, 4:3))
                    deriv <- deriv + of.diag * aperm(deriv, c(1, 2, 4, 3))
                    # matricize and return
                    dim(deriv) <- c(p[j]^2, p[l]^2)
                    deriv
                }, J, L)

                names(I12) <- sprintf("I(eta1, alpha_%d)", 1:r)
                names(I13) <- sprintf("I(eta1, Omega_%d)", 1:r)
                names(I22) <- sprintf("I(alpha_%d, alpha_%d)", J, L)
                names(I23) <- sprintf("I(alpha_%d, Omega_%d)", J, L)
                names(I33) <- sprintf("I(Omega_%d, Omega_%d)", J, L)

                list(I11 = I11, I12 = I12, I13 = I13, I22 = I22, I23 = I23, I33 = I33)
            }
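            # The named components assemble into the symmetric block matrix
            #     I(theta) = [ I11  I12  I13 ]
            #                [  .   I22  I23 ]
            #                [  .    .   I33 ]
            # over the parameter blocks `eta1`, `alphas` and `Omegas`; only one
            # triangle of blocks (the tuples with `J <= L`) is stored.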
            # Hessian of the scaled negative log-likelihood
            hessian <- function(X, Fy, eta1, alphas, Omegas) {
                stop("Not Implemented")
            }
        },
        ########################################################################
        ###                      Multi-Variate Bernoulli                     ###
        ########################################################################
        bernoulli = {
            # provides `as.mvbinary`, `ising_log_likelihood` and
            # `ising_marginal_probs` used below
            require(mvbernoulli)

            initialize <- function(X, Fy) {
                p <- head(dim(X), -1)
                r <- length(p)

                # Mode-Covariances
                XSigmas <- mcov(X, sample.axis = r + 1L)
                YSigmas <- mcov(Fy, sample.axis = r + 1L)

                # Extract main mode covariance directions
                # Note: the directions are transposed!
                XDirs <- Map(function(Sigma) {
                    SVD <- La.svd(Sigma, nu = 0)
                    sqrt(SVD$d) * SVD$vt
                }, XSigmas)
                YDirs <- Map(function(Sigma) {
                    SVD <- La.svd(Sigma, nu = 0)
                    sqrt(SVD$d) * SVD$vt
                }, YSigmas)

                alphas <- Map(function(xdir, ydir) {
                    s <- min(ncol(xdir), nrow(ydir))
                    crossprod(xdir[seq_len(s), , drop = FALSE],
                              ydir[seq_len(s), , drop = FALSE])
                }, XDirs, YDirs)

                list(
                    eta1 = array(0, dim = p),
                    alphas = alphas,
                    Omegas = Map(diag, p)
                )
            }
            params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
                # number of observations
                n <- tail(dim(Fy), 1)

                # natural exponential family parameters
                eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1))
                eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas))

                # # next the conditional Ising model parameters `theta_y`
                # theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n)
                # dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n)
                # ltri <- which(lower.tri(eta_y2, diag = TRUE))
                # diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
                # theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1)
                # theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ]

                # conditional Ising model parameters
                theta_y <- matrix(rep(vech(eta_y2), n), ncol = n)
                ltri <- which(lower.tri(eta_y2, diag = TRUE))
                diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
                theta_y[diagonal, ] <- eta_y1

                theta_y
            }
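            # Illustrative sketch (hypothetical numbers): for `prod(p) = 2`
            # with `eta_y2 = matrix(c(a, b, b, d), 2, 2)` the half
            # vectorization is `vech(eta_y2) = c(a, b, d)` with diagonal
            # positions 1 and 3, so every column of `theta_y` becomes
            #     c(eta_y1[1], b, eta_y1[2])
            # i.e. the diagonal natural parameters are replaced by the first
            # order (main effect) parameters `eta_y1`.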
            # Scaled negative log-likelihood
            log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
                # number of observations
                n <- tail(dim(X), 1L)

                # conditional Ising model parameters
                theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)

                # convert to a binary data set
                storage.mode(X) <- "integer"
                X.mvb <- as.mvbinary(mat(X, length(dim(X))))

                # log-likelihood of the data set
                -mean(sapply(seq_len(n), function(i) {
                    ising_log_likelihood(theta_y[, i], X.mvb[i])
                }))
            }
            # Gradient of the scaled negative log-likelihood
            grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
                # retrieve dimensions
                n <- tail(dim(X), 1)    # sample size
                p <- head(dim(X), -1)   # predictor dimensions
                r <- length(p)          # single predictor/response tensor order

                ## "Inverse" Link: Ising Model Specific
                # conditional Ising model parameters: `p (p + 1) / 2` by `n`
                theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)

                # conditional expectations
                # ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y]
                E2 <- apply(theta_y, 2L, ising_marginal_probs)
                # convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y]
                E2 <- E2[vech.pinv.index(prod(p)), ]
                # extract the diagonal elements, which are equal to
                # E[vec(X) | Y = y] since the components of X are binary
                # (x^2 = x for x in {0, 1})
                E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ]
                ## end "Inverse" Link

                dim(X) <- c(prod(p), n)

                # Residuals
                R <- X - E1
                dim(R) <- c(p, n)

                # mean deviation between the sample covariance and the GLM
                # estimated covariance
                # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
                S <- rowMeans(colKronecker(X, X) - E2)
                dim(S) <- c(p, p)   # reshape to tensor of order `2 r`

                # Gradients of the negative log-likelihood scaled by sample size
                list(
                    "Dl(eta1)" = -c1 * rowMeans(R, dims = r),
                    "Dl(alphas)" = Map(function(j) {
                        (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
                    }, 1:r),
                    "Dl(Omegas)" = Map(function(j) {
                        deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
                        # adapt to the symmetry constraint for the derivative
                        dim(deriv) <- c(p[j], p[j])
                        deriv + t(deriv * (1 - diag(p[j])))
                    }, 1:r)
                )
            }
            # Hessian of the scaled negative log-likelihood
            hessian <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
                stop("Not Implemented")
            }

            # Conditional Fisher Information
            fisher.info <- function(Fy, eta1, alphas, Omegas) {
                stop("Not Implemented")
            }
        }
    )
    list(
        family = name,
        initialize = initialize,
        params = params,
        # linkinv = linkinv,
        log.likelihood = log.likelihood,
        grad = grad,
        hessian = hessian,
        fisher.info = fisher.info
    )
}
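
# Usage sketch (illustrative only; `mlm`, `mcov`, `mat`, ... are provided by
# this package, the dimensions below are made up):
#
#     family <- make.gmlm.family("normal")   # "gaussian", "ising", ... also work
#     p <- c(5, 7); q <- c(2, 3); n <- 100
#     X  <- array(rnorm(prod(p) * n), dim = c(p, n))
#     Fy <- array(rnorm(prod(q) * n), dim = c(q, n))
#     theta <- family$initialize(X, Fy)
#     with(theta, family$log.likelihood(X, Fy, eta1, alphas, Omegas))
#     str(with(theta, family$grad(X, Fy, eta1, alphas, Omegas)))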