fix: some typos in GMLM.tex,

add: extended interface of num.deriv and matrixImage,
add: "tests",
add: moved make.gmlm.family into separate file
Daniel Kapla 2022-10-31 15:14:58 +01:00
parent 79f16c58dc
commit 2dfdc7083b
11 changed files with 746 additions and 298 deletions

View File

@ -139,11 +139,12 @@
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We consider a method for Sufficient Dimension Reduction of Tensor-valued predictors (multi dimensional arrays) for regression or classification. We assume the distribution of the Tensor-valued predictors given the response is in the Quadratic Exponential Family and model the natural parameter of a Generalized Linear Model in an inverse regression setting, where the relation via a link is of multi-linear nature.
Using a multi-linear relation allows performing per-axis reductions, which reduces the total number of parameters drastically for higher order Tensor-valued predictors. Under the Exponential Family we derive maximum likelihood estimates for the multi-linear sufficient dimension reduction of the Tensor-valued predictors. Furthermore, we provide an estimation algorithm which utilizes the Tensor structure, allowing efficient implementations. The performance of the method is illustrated via simulations and real world examples are provided.
We consider regression and classification for \textit{general} response and tensor-valued predictors (multi dimensional arrays) and propose a \textit{novel formulation} for sufficient dimension reduction. Assuming the distribution of the tensor-valued predictors given the response is in the quadratic exponential family, we model the natural parameter as a multi-linear function of the response.
This allows per-axis reductions that drastically reduce the total number of parameters for higher order tensor-valued predictors. We derive maximum likelihood estimates for the sufficient dimension reduction and a computationally efficient estimation algorithm which leverages the tensor structure. The performance of the method is illustrated via simulations, and real-world examples are provided.
\end{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
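For orientation when reading the hunks below: consistent with the notation used there and with the implementation added in this commit, the multi-linear model referred to in the abstract can be sketched as
\begin{displaymath}
\mat{\eta}_{y,1} = c_1\Bigl(\vec{\overline{\ten{\eta}}_1} + \vec\Bigl(\ten{F}_y\mlm{k\in[r]}\mat{\alpha}_k\Bigr)\Bigr),
\qquad
\mat{\eta}_{y,2} = c_2\bigotimes_{k = r}^{1}\mat{\Omega}_k.
\end{displaymath}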
@ -523,13 +524,13 @@ The next step is to identify the Hessians from the second differentials in a sim
&\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \vec\biggl( (\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j) \Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))} \biggr) \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big)_{(j)}\otimes\mat{I}_{p_j} \biggr) \mat{K}_{\mat{p},(j)}\mat{H}_{1,1}\t{\mat{K}_{\mat{p},(l)}} \biggl( \t{\Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big)_{(l)}}\otimes\mat{I}_{p_l} \biggr)\vec{\d\mat{\alpha}_l} \\
&\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \biggl( \t{\Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))}}\otimes\mat{I}_{p_j p_l} \biggr) \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \vec{\d\mat{\alpha}_l} \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \vec{\d\mat{\alpha}_l} \\
&\qquad + c_1 \vec\biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr) \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \vec{\d\mat{\alpha}_l} \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \vec{\d\mat{\alpha}_l} \\
&\qquad + c_1 \t{(\vec{\d\mat{\alpha}_j})} \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3))} \vec{\d\mat{\alpha}_l} \\
&\qquad \begin{aligned}
\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\mat{\alpha}_l})}} &=
-c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \\
-c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \\
&\qquad + c_1 \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3) + [[j > l]])}
\qquad{\color{gray} (p_j q_j \times p_l q_l)}
\end{aligned}
@ -591,22 +592,23 @@ and with that all auxiliary calculation are done. Using the two expectations yie
\mathcal{I}_{2 r + 1,1} & \mathcal{I}_{2 r + 1,2} & \cdots & \mathcal{I}_{2 r + 1, 2 r + 1}
\end{pmatrix}
\end{displaymath}
for $j, l = 1, ..., r$ as follows
and every block satisfies $\mathcal{I}_{j, l} = \t{\mathcal{I}_{l, j}}$. The individual blocks are given, for $j = 1, ..., r$ and $j \leq l \leq r$, by
\begin{align*}
\mathcal{I}_{1,1} &= c_1^2 (\ten{H}_{1,1})_{([r])} \\
\mathcal{I}_{1,j+1} = \t{\mathcal{I}_{j+1,1}}
\mathcal{I}_{1,j+1} % = \E\partial_{\vec{\overline{\ten{\eta}}_1}}\partial_{\t{(\vec{\mat{\alpha}_j})}} l(\mat{\Theta})\mid \ten{Y} = y
&= c_1^2 \Big[\Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1}\Big]_{((2, 1))} \\
\mathcal{I}_{1,j+1} = \t{\mathcal{I}_{j+1,1}}
\mathcal{I}_{1,j+r+1}
&= c_1 c_2 \Big( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)} \\
\mathcal{I}_{j+1,l+1} = \t{\mathcal{I}_{l+1,j+1}}
\mathcal{I}_{j+1,l+1}
&= c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \\
\mathcal{I}_{j+1,l+r+1} = \t{\mathcal{I}_{l+r+1,j+1}}
\mathcal{I}_{j+1,l+r+1}
&= c_1 c_2 \biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{((r + 2, 1))} \\
\mathcal{I}_{j+r+1,l+r+1} = \t{\mathcal{I}_{l+r+1,j+r+1}}
\mathcal{I}_{j+r+1,l+r+1}
&= c_2^2 \Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}
\end{align*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Vectorization and Matricization}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -615,7 +617,7 @@ The \emph{matricization} is a generalization of the \emph{vectorization} operati
\begin{theorem}\label{thm:mlm_mat}
Let $\ten{A}$ be a tensor of order $r$ with dimensions $q_1\times ... \times q_r$. Furthermore, for $k = 1, ..., r$ let $\mat{B}_k$ be matrices of dimensions $p_k\times q_k$. Then, for any $(\mat{i}, \mat{j})\in\perm{r}$ holds
\begin{displaymath}
(\ten{A}\times_{k\in[r]}\mat{B}_k)_{(\mat{i}, \mat{j})}
\Big(\ten{A}\mlm{k\in[r]}\mat{B}_k\Big)_{(\mat{i}, \mat{j})}
= \Big(\bigotimes_{k = \len{\mat{i}}}^{1}\mat{B}_{\mat{i}_k}\Big) \ten{A}_{(\mat{i}, \mat{j})} \Big(\bigotimes_{k = \len{\mat{j}}}^{1}\t{\mat{B}_{\mat{j}_k}}\Big).
\end{displaymath}
\end{theorem}
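A quick numeric check of the theorem, in the style of the unit tests added in this commit (a sketch; it assumes the package's exported `mlm` and `mat` follow the $(\mat{i}, \mat{j})$-matricization convention above):
A <- array(rnorm(2 * 3 * 4), dim = c(2, 3, 4))
Bs <- list(matrix(rnorm(5 * 2), 5, 2), matrix(rnorm(6 * 3), 6, 3), matrix(rnorm(7 * 4), 7, 4))
# (i, j) = ((1, 2), (3)): rows indexed by modes 1 and 2, columns by mode 3
lhs <- mat(mlm(A, Bs), 1:2)
rhs <- (Bs[[2]] %x% Bs[[1]]) %*% mat(A, 1:2) %*% t(Bs[[3]])
stopifnot(all.equal(lhs, rhs))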

View File

@ -47,6 +47,7 @@ export(mkm)
export(mlm)
export(mtvk)
export(num.deriv)
export(num.deriv.function)
export(num.deriv2)
export(reduce)
export(rowKronecker)

View File

@ -5,265 +5,6 @@ GMLM <- function(...) {
stop("Not Implemented")
}
make.gmlm.family <- function(name) {
# standardize family name
name <- list(
normal = "normal", gaussian = "normal",
bernoulli = "bernoulli", ising = "bernoulli"
)[[tolower(name), exact = FALSE]]
############################################################################
# #
# TODO: better (and possibly specialized) initial parameters!?!?!?! #
# #
############################################################################
switch(name,
normal = {
initialize <- function(X, Fy) {
p <- head(dim(X), -1)
r <- length(p)
# Mode-Covariances
XSigmas <- mcov(X, sample.axis = r + 1L)
YSigmas <- mcov(Fy, sample.axis = r + 1L)
# Extract main mode covariance directions
# Note: (the directions are transposed!)
XDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, XSigmas)
YDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, YSigmas)
alphas <- Map(function(xdir, ydir) {
s <- min(ncol(xdir), nrow(ydir))
crossprod(xdir[seq_len(s), , drop = FALSE],
ydir[seq_len(s), , drop = FALSE])
}, XDirs, YDirs)
list(
eta1 = rowMeans(X, dims = r),
alphas = alphas,
Omegas = Map(diag, p)
)
}
# parameters of the tensor normal computed from the GLM parameters
params <- function(Fy, eta1, alphas, Omegas) {
Deltas <- Map(solve, Omegas)
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
list(mu_y = mu_y, Deltas = Deltas)
}
# scaled negative log-likelihood
log.likelihood <- function(X, Fy, eta1, alphas, Omegas) {
n <- tail(dim(X), 1) # sample size
# conditional mean
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas))
# negative log-likelihood scaled by sample size
# Note: the `suppressWarnings` is because `log(mapply(det, Omegas))`
# might fail, but `NAGD` has failsafes against cases of "illegal"
# parameters.
suppressWarnings(
0.5 * prod(p) * log(2 * pi) +
sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) -
(0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p)
)
}
# gradient of the scaled negative log-likelihood
grad <- function(X, Fy, eta1, alphas, Omegas) {
# retrieve dimensions
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions
r <- length(p) # single predictor/response tensor order
## "Inverse" Link: Tensor Normal Specific
# known exponential family constants
c1 <- 1
c2 <- -0.5
# Covariances from the GLM parameter Scatter matrices
Deltas <- Map(solve, Omegas)
# First moment via "inverse" link `g1(eta_y) = E[X | Y = y]`
E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
# Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]`
dim(E1) <- c(prod(p), n)
E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1))
## end "Inverse" Link
dim(X) <- c(prod(p), n)
# Residuals
R <- X - E1
dim(R) <- c(p, n)
# mean deviation between the sample covariance and the GLM estimated covariance
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
S <- rowMeans(colKronecker(X, X)) - E2 # <- Optimized for Tensor Normal
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
# Gradients of the negative log-likelihood scaled by sample size
list(
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
"Dl(alphas)" = Map(function(j) {
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
}, 1:r),
"Dl(Omegas)" = Map(function(j) {
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
# adapt to the symmetry constraint for the derivative
dim(deriv) <- c(p[j], p[j])
deriv + t(deriv * (1 - diag(p[j])))
}, 1:r)
)
}
},
bernoulli = {
require(mvbernoulli)
initialize <- function(X, Fy) {
p <- head(dim(X), -1)
r <- length(p)
# Mode-Covariances
XSigmas <- mcov(X, sample.axis = r + 1L)
YSigmas <- mcov(Fy, sample.axis = r + 1L)
# Extract main mode covariance directions
# Note: (the directions are transposed!)
XDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, XSigmas)
YDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, YSigmas)
alphas <- Map(function(xdir, ydir) {
s <- min(ncol(xdir), nrow(ydir))
crossprod(xdir[seq_len(s), , drop = FALSE],
ydir[seq_len(s), , drop = FALSE])
}, XDirs, YDirs)
list(
eta1 = array(0, dim = p),
alphas = alphas,
Omegas = Map(diag, p)
)
}
params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# number of observations
n <- tail(dim(Fy), 1)
# natural exponential family parameters
eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1))
eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas))
# # next the conditional Ising model parameters `theta_y`
# theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n)
# dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n)
# ltri <- which(lower.tri(eta_y2, diag = TRUE))
# diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
# theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1)
# theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ]
# conditional Ising model parameters
theta_y <- matrix(rep(vech(eta_y2), n), ncol = n)
ltri <- which(lower.tri(eta_y2, diag = TRUE))
diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
theta_y[diagonal, ] <- eta_y1
theta_y
}
# Scaled negative log-likelihood
log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# number of observations
n <- tail(dim(X), 1L)
# conditional Ising model parameters
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
# convert to binary data set
storage.mode(X) <- "integer"
X.mvb <- as.mvbinary(mat(X, length(dim(X))))
# log-likelihood of the data set
-mean(sapply(seq_len(n), function(i) {
ising_log_likelihood(theta_y[, i], X.mvb[i])
}))
}
# Gradient of the scaled negative log-likelihood
grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# retrieve dimensions
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions
r <- length(p) # single predictor/response tensor order
## "Inverse" Link: Ising Model Specific
# conditional Ising model parameters: `p (p + 1) / 2` by `n`
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
# conditional expectations
# ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y]
E2 <- apply(theta_y, 2L, ising_marginal_probs)
# convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y]
E2 <- E2[vech.pinv.index(prod(p)), ]
# extract diagonal elements which are equal to E[vec(X) | Y = y]
E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ]
## end "Inverse" Link
dim(X) <- c(prod(p), n)
# Residuals
R <- X - E1
dim(R) <- c(p, n)
# mean deviation between the sample covariance and the GLM estimated covariance
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
S <- rowMeans(colKronecker(X, X) - E2)
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
# Gradients of the negative log-likelihood scaled by sample size
list(
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
"Dl(alphas)" = Map(function(j) {
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
}, 1:r),
"Dl(Omegas)" = Map(function(j) {
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
# adapt to the symmetry constraint for the derivative
dim(deriv) <- c(p[j], p[j])
deriv + t(deriv * (1 - diag(p[j])))
}, 1:r)
)
}
}
)
list(
family = name,
initialize = initialize,
params = params,
# linkinv = linkinv,
log.likelihood = log.likelihood,
grad = grad
)
}
#' @export
GMLM.default <- function(X, Fy, sample.axis = 1L,
family = "normal",
@ -292,6 +33,7 @@ GMLM.default <- function(X, Fy, sample.axis = 1L,
}
}
# optimize likelihood using Nesterov Accelerated Gradient Descent
params.fit <- NAGD(
fun.loss = function(params) {
# scaled negative log-likelihood

View File

@ -24,7 +24,7 @@
#' matrix(1, 2, 3)
#' array(rep(1, 6), dim = c(2, 3))
#' })
#' # basicaly identical to
#' # is identical to
#' stopifnot(exprs = {
#' all.equal(matrix(rep(1, 6), 2, 3), matrix(1, 2, 3))
#' all.equal(matrix(rep(1, 6), 2, 3), array(rep(1, 6), dim = c(2, 3)))

View File

@ -0,0 +1,386 @@
make.gmlm.family <- function(name) {
# standardize family name
name <- list(
normal = "normal", gaussian = "normal",
bernoulli = "bernoulli", ising = "bernoulli"
)[[tolower(name), exact = FALSE]]
############################################################################
# #
# TODO: better (and possibly specialized) initial parameters!?!?!?! #
# #
############################################################################
switch(name,
########################################################################
### Tensor Normal ###
########################################################################
normal = {
initialize <- function(X, Fy) {
p <- head(dim(X), -1)
r <- length(p)
# Mode-Covariances
XSigmas <- mcov(X, sample.axis = r + 1L)
YSigmas <- mcov(Fy, sample.axis = r + 1L)
# Extract main mode covariance directions
# Note: (the directions are transposed!)
XDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, XSigmas)
YDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, YSigmas)
alphas <- Map(function(xdir, ydir) {
s <- min(ncol(xdir), nrow(ydir))
crossprod(xdir[seq_len(s), , drop = FALSE],
ydir[seq_len(s), , drop = FALSE])
}, XDirs, YDirs)
list(
eta1 = rowMeans(X, dims = r),
alphas = alphas,
Omegas = Map(diag, p)
)
}
# parameters of the tensor normal computed from the GLM parameters
params <- function(Fy, eta1, alphas, Omegas) {
Deltas <- Map(solve, Omegas)
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
list(mu_y = mu_y, Deltas = Deltas)
}
# scaled negative log-likelihood
log.likelihood <- function(X, Fy, eta1, alphas, Omegas) {
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions
# conditional mean
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas))
# negative log-likelihood scaled by sample size
# Note: the `suppressWarnings` is because `log(mapply(det, Omegas))`
# might fail, but `NAGD` has failsafes against cases of "illegal"
# parameters.
suppressWarnings(
0.5 * prod(p) * log(2 * pi) +
sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) -
(0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p)
)
}
# gradient of the scaled negative log-likelihood
grad <- function(X, Fy, eta1, alphas, Omegas) {
# retrieve dimensions
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions
r <- length(p) # single predictor/response tensor order
## "Inverse" Link: Tensor Normal Specific
# known exponential family constants
c1 <- 1
c2 <- -0.5
# Covariances from the GLM parameter Scatter matrices
Deltas <- Map(solve, Omegas)
# First moment via "inverse" link `g1(eta_y) = E[X | Y = y]`
E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
# Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]`
dim(E1) <- c(prod(p), n)
E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1))
## end "Inverse" Link
dim(X) <- c(prod(p), n)
# Residuals
R <- X - E1
dim(R) <- c(p, n)
# mean deviation between the sample covariance and the GLM estimated covariance
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
S <- rowMeans(colKronecker(X, X)) - E2 # <- Optimized for Tensor Normal
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
# Gradients of the negative log-likelihood scaled by sample size
list(
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
"Dl(alphas)" = Map(function(j) {
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
}, 1:r),
"Dl(Omegas)" = Map(function(j) {
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
# adapt to the symmetry constraint for the derivative
dim(deriv) <- c(p[j], p[j])
deriv + t(deriv * (1 - diag(p[j])))
}, 1:r)
)
}
# mean conditional Fisher Information
fisher.info <- function(Fy, eta1, alphas, Omegas) {
# retrieve dimensions
n <- tail(dim(Fy), 1) # sample size
p <- dim(eta1) # predictor dimensions
q <- head(dim(Fy), -1) # response dimensions
r <- length(p) # single predictor/response tensor order
# Hij = Cov(ti(X) %x% tj(X) | Y = y), i, j = 1, 2
H11 <- Reduce(`%x%`, rev(Map(solve, Omegas))) # covariance
# 3rd central moment is zero
H12 <- H21 <- 0
# 4th moment by "Isserlis' theorem" a.k.a. "Wick's theorem"
H22 <- kronecker(H11, H11)
dim(H22) <- rep(prod(p), 4)
H22 <- H22 + aperm(H22, c(1, 3, 2, 4)) + aperm(H22, c(1, 3, 2, 4))
dim(H11) <- c(p, p)
dim(H22) <- c(p, p, p, p)
## Fisher Information: Tensor Normal Specific
# known exponential family constants
c1 <- 1
c2 <- -0.5
# list of (Fy x_{k in [r]\j} alpha_j)
Gys <- Map(function(j) {
mlm(Fy, alphas[-j], (1:r)[-j])
}, 1:r)
# setup indices of lower triangular matrix elements used for
# all tuples (j, l) with 1 <= j <= l <= r.
ltri <- lower.tri(diag(r), TRUE)
J <- .col(c(r, r))[ltri]
L <- .row(c(r, r))[ltri]
# inverse perfect outer shuffle of `2 r` elements
iShuf <- rep(1:r, each = 2) + c(0, r)
# Getter for the i'th observation from Gys[[j]]
# TODO: this is ugly, find a better (but still dynamic) version
GyGet <- function(i, j) {
obs <- eval(str2lang(paste(
"Gys[[j]][",
paste(rep(",", r), collapse = ""),
"i, drop = FALSE]"
, collapse = "")))
dim(obs) <- head(dim(obs), -1)
obs
}
I11 <- c1^2 * mat(H11, 1:r)
I12 <- Map(function(j) {
i11 <- ttt(Gys[[j]], H11, (1:r)[-j])
# take mean with respect to observations (second mode)
i11 <- colMeans(mat(i11, 2))
dim(i11) <- c(q[j], p[j], p)
t(mat(i11, 2:1))
}, 1:r)
I13 <- Map(matrix, 0, prod(p), p^2)
I22 <- Map(function(j, l) {
i22 <- 0
for (i in 1:n) {
i22 <- i22 + ttt(
ttt(GyGet(i, j), H11, (1:r)[-j]),
GyGet(i, l), (1:r)[-l] + 2, (1:r)[-l])
}
(c1^2 / n) * mat(i22, 2:1)
}, J, L)
I23 <- Map(matrix, 0, (p * q)[J], (p^2)[L])
# shuffled H22
sH22 <- c2^2 * aperm(H22, c(iShuf, iShuf + 2 * r))
dim(sH22) <- c(p^2, p^2)
I33 <- Map(function(j, l) {
# second derivative (without constraints)
deriv <- mlm(mlm(sH22, Map(c, Omegas[-j]), (1:r)[-j],
transposed = TRUE),
Map(c, Omegas[-l]), (1:r)[-l] + r,
transposed = TRUE)
dim(deriv) <- c(p[j], p[j], p[l], p[l])
# enforce symmetry of `Omega_j`
of.diag <- (slice.index(deriv, 1:2) != slice.index(deriv, 2:1))
deriv <- deriv + of.diag * aperm(deriv, c(2, 1, 3, 4))
# as well as the symmetry of `Omega_l`
of.diag <- (slice.index(deriv, 3:4) != slice.index(deriv, 4:3))
deriv <- deriv + of.diag * aperm(deriv, c(1, 2, 4, 3))
# matricize and return
dim(deriv) <- c(p[j]^2, p[l]^2)
deriv
}, J, L)
names(I12) <- sprintf("I(eta1, alpha_%d)", 1:r)
names(I13) <- sprintf("I(eta1, Omega_%d)", 1:r)
names(I22) <- sprintf("I(alpha_%d, alpha_%d)", J, L)
names(I23) <- sprintf("I(alpha_%d, Omega_%d)", J, L)
names(I33) <- sprintf("I(Omega_%d, Omega_%d)", J, L)
list(I11 = I11, I12 = I12, I13 = I13, I22 = I22, I23 = I23, I33 = I33)
}
# Hessian of the scaled negative log-likelihood
hessian <- function(X, Fy, eta1, alphas, Omegas) {
stop("Not Implemented")
}
},
########################################################################
### Multi-Variate Bernoulli ###
########################################################################
bernoulli = {
require(mvbernoulli)
initialize <- function(X, Fy) {
p <- head(dim(X), -1)
r <- length(p)
# Mode-Covariances
XSigmas <- mcov(X, sample.axis = r + 1L)
YSigmas <- mcov(Fy, sample.axis = r + 1L)
# Extract main mode covariance directions
# Note: (the directions are transposed!)
XDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, XSigmas)
YDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, YSigmas)
alphas <- Map(function(xdir, ydir) {
s <- min(ncol(xdir), nrow(ydir))
crossprod(xdir[seq_len(s), , drop = FALSE],
ydir[seq_len(s), , drop = FALSE])
}, XDirs, YDirs)
list(
eta1 = array(0, dim = p),
alphas = alphas,
Omegas = Map(diag, p)
)
}
params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# number of observations
n <- tail(dim(Fy), 1)
# natural exponential family parameters
eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1))
eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas))
# # next the conditional Ising model parameters `theta_y`
# theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n)
# dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n)
# ltri <- which(lower.tri(eta_y2, diag = TRUE))
# diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
# theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1)
# theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ]
# conditional Ising model parameters
theta_y <- matrix(rep(vech(eta_y2), n), ncol = n)
ltri <- which(lower.tri(eta_y2, diag = TRUE))
diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
theta_y[diagonal, ] <- eta_y1
theta_y
}
# Scaled negative log-likelihood
log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# number of observations
n <- tail(dim(X), 1L)
# conditional Ising model parameters
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
# convert to binary data set
storage.mode(X) <- "integer"
X.mvb <- as.mvbinary(mat(X, length(dim(X))))
# log-likelihood of the data set
-mean(sapply(seq_len(n), function(i) {
ising_log_likelihood(theta_y[, i], X.mvb[i])
}))
}
# Gradient of the scaled negative log-likelihood
grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# retrieve dimensions
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions
r <- length(p) # single predictor/response tensor order
## "Inverse" Link: Ising Model Specific
# conditional Ising model parameters: `p (p + 1) / 2` by `n`
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
# conditional expectations
# ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y]
E2 <- apply(theta_y, 2L, ising_marginal_probs)
# convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y]
E2 <- E2[vech.pinv.index(prod(p)), ]
# extract diagonal elements which are equal to E[vec(X) | Y = y]
E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ]
## end "Inverse" Link
dim(X) <- c(prod(p), n)
# Residuals
R <- X - E1
dim(R) <- c(p, n)
# mean deviation between the sample covariance and the GLM estimated covariance
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
S <- rowMeans(colKronecker(X, X) - E2)
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
# Gradients of the negative log-likelihood scaled by sample size
list(
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
"Dl(alphas)" = Map(function(j) {
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
}, 1:r),
"Dl(Omegas)" = Map(function(j) {
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
# adapt to the symmetry constraint for the derivative
dim(deriv) <- c(p[j], p[j])
deriv + t(deriv * (1 - diag(p[j])))
}, 1:r)
)
}
# Hessian of the scaled negative log-likelihood
hessian <- function(X, Fy, alphas, Omegas, c1 = 1, c2 = 1) {
stop("Not Implemented")
}
# Conditional Fisher Information
fisher.info <- function(Fy, alphas, Omegas) {
stop("Not Implemented")
}
}
)
list(
family = name,
initialize = initialize,
params = params,
# linkinv = linkinv,
log.likelihood = log.likelihood,
grad = grad,
hessian = hessian,
fisher.info = fisher.info
)
}
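A minimal usage sketch of the returned family object (toy dimensions; it relies on package helpers such as `mlm`, `mcov` and `colKronecker` that the family functions call internally):
family <- make.gmlm.family("normal")
p <- c(2, 3); q <- c(2, 2); n <- 100
X <- array(rnorm(n * prod(p)), dim = c(p, n))
Fy <- array(rnorm(n * prod(q)), dim = c(q, n))
theta <- family$initialize(X, Fy) # list(eta1, alphas, Omegas)
nll <- with(theta, family$log.likelihood(X, Fy, eta1, alphas, Omegas))
gradients <- with(theta, family$grad(X, Fy, eta1, alphas, Omegas))
str(gradients, max.level = 1) # "Dl(eta1)", "Dl(alphas)", "Dl(Omegas)"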

View File

@ -15,35 +15,43 @@
#'
#' @export
matrixImage <- function(A, add.values = FALSE,
main = NULL, sub = NULL, interpolate = FALSE, ...,
main = NULL, sub = NULL, interpolate = FALSE, ..., zlim = NA,
axes = TRUE, asp = 1, col = hcl.colors(24, "YlOrRd", rev = FALSE),
digits = getOption("digits")
) {
# plot raster image
plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "red",
xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main, sub = sub)
plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "black",
xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main, sub = sub,
asp = asp)
# Scale values of `A` to [0, 1] with min mapped to 1 and max to 0.
if (missing(zlim)) {
S <- (max(A) - A) / diff(range(A))
} else {
S <- pmin(pmax(0, (zlim[2] - A) / diff(zlim)), 1)
}
# and now transform to color
S <- matrix(col[round((length(col) - 1) * S + 1)], nrow = nrow(A))
# Raster image plotting the matrix with element values mapped to the color
# scale such that big elements (of the original matrix A) are dark and small
# (negative) elements are light.
rasterImage(S, 0, 0, ncol(A), nrow(A), interpolate = interpolate, ...)
# Add X-axis giving index
# X/Y axes index (matches coordinates to matrix indices)
x <- seq(1, ncol(A), by = 1)
axis(1, at = x - 0.5, labels = x, lwd = 0, lwd.ticks = 1)
# Add Y-axis
y <- seq(1, nrow(A))
if (axes) {
axis(1, at = x - 0.5, labels = x, lwd = 0, lwd.ticks = 1)
axis(2, at = y - 0.5, labels = rev(y), lwd = 0, lwd.ticks = 1, las = 1)
}
# Writes matrix values (in colored element grids)
# Writes matrix values
if (any(add.values)) {
if (length(add.values) > 1) {
A[!add.values] <- NA
A[add.values] <- format(A[add.values], digits = digits)
}
text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A,
adj = 0.5, col = as.integer(S > 0.65))
text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A, adj = 0.5)
}
}
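A small usage sketch of the extended interface (hypothetical data; `zlim` fixes the color scale, making several plots comparable, and `add.values` may be a logical matrix selecting the entries to annotate):
A <- matrix(rnorm(4 * 6), 4, 6)
matrixImage(A, main = "all entries, data-driven scale")
matrixImage(A, add.values = abs(A) > 1, digits = 2, zlim = c(-3, 3), main = "large entries annotated")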

View File

@ -3,34 +3,101 @@
#' @example inst/examples/num_deriv.R
#'
#' @export
num.deriv <- function(F, X, h = 1e-6, sym = FALSE) {
num.deriv <- function(expr, ..., h = 1e-6, sym = FALSE) {
sexpr <- substitute(expr)
if (...length() != 1) {
stop("Expectd exactly one '...' variable")
}
var <- ...names()[1]
if (is.null(var)) {
arg <- substitute(...)
var <- if (is.symbol(arg)) as.character(arg) else "x"
}
if (is.language(sexpr) && !is.symbol(sexpr) && sexpr[[1]] == as.symbol("function")) {
func <- expr
} else {
if (is.name(sexpr)) {
expr <- call(as.character(sexpr), as.name(var))
} else {
if ((!is.call(sexpr) && !is.expression(sexpr))
|| !(var %in% all.vars(sexpr))) {
stop("'expr' must be a function or expression containing '", var, "'")
}
expr <- sexpr
}
args <- as.pairlist(structure(list(alist(x = )[[1]]), names = var))
func <- as.function(c(args, expr), envir = parent.frame())
}
num.deriv.function(func, ..1, h = h, sym = sym)
}
#' @rdname num.deriv
#' @export
num.deriv.function <- function(FUN, X, h = 1e-6, sym = FALSE) {
if (sym) {
stopifnot(isSymmetric(X))
p <- nrow(X)
k <- seq_along(X) - 1
mapply(function(i, j) {
dx <- h * ((k == i * p + j) | (k == j * p + i))
(F(X + dx) - F(X - dx)) / (2 * h)
(FUN(X + dx) - FUN(X - dx)) / (2 * h)
}, .row(dim(X)) - 1, .col(dim(X)) - 1)
} else {
sapply(seq_along(X), function(i) {
dx <- h * (seq_along(X) == i)
(F(X + dx) - F(X - dx)) / (2 * h)
(FUN(X + dx) - FUN(X - dx)) / (2 * h)
})
}
}
#' Second numeric derivative
#'
#' @rdname num.deriv
#' @export
num.deriv2 <- function(F, X, Y, h = 1e-6, symX = FALSE, symY = FALSE) {
num.deriv2 <- function(FUN, X, Y, h = 1e-6, symX = FALSE, symY = FALSE) {
if (missing(Y)) {
num.deriv(function(x) {
num.deriv(function(z) F(z), x, h = h, sym = symX)
num.deriv.function(function(x) {
num.deriv.function(FUN, x, h = h, sym = symX)
}, X, h = h, sym = symX)
} else {
num.deriv(function(y) {
num.deriv(function(x) F(x, y), X, h = h, sym = symX)
num.deriv.function(function(y) {
num.deriv.function(function(x) FUN(x, y), X, h = h, sym = symX)
}, Y, h = h, sym = symY)
}
}
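A short sanity check of `num.deriv2` (a sketch using only base R beyond the functions above): for f(x) = sum(x^2) the Hessian is twice the identity.
f <- function(x) sum(x^2)
x0 <- rnorm(3)
H.num <- num.deriv2(f, x0) # 3 x 3 numeric Hessian
stopifnot(all.equal(H.num, 2 * diag(3), tolerance = 1e-3))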
### Interface Idea / Demo
# num.deriv2.function
# num.deriv2 <- function(expr, var) {
# sexpr <- substitute(expr)
# svar <- substitute(var)
#
# if (is.language(sexpr) && !is.symbol(sexpr) && sexpr[[1]] == as.symbol("function")) {
# func <- expr
# } else {
# if (is.name(sexpr)) {
# expr <- call(as.character(sexpr), as.name(svar))
# } else {
# if ((!is.call(sexpr) && !is.expression(sexpr))
# || !(as.character(svar) %in% all.vars(sexpr))) {
# stop("'expr' must be a function or expression containing '",
# as.character(svar), "'")
# }
# expr <- sexpr
# }
#
# args <- as.pairlist(structure(list(alist(x = )[[1]]), names = as.character(svar)))
# func <- as.function(c(args, expr), envir = parent.frame())
# }
#
# num.deriv2.function(func, var)
# }
# y <- c(pi, exp(1), (sqrt(5) + 1) / 2)
# num.deriv2(function(x) x^2 + y, x)(1:3)
# num.deriv2(x^2 + y, x)(1:3)
# func <- function(x) x^2 + y
# num.deriv2(func, x)(1:3)
# func2 <- function(z = y, x) x^2 + z
# num.deriv2(func2, x)(1:3)

View File

@ -4,15 +4,17 @@ A <- A + t(A)
B <- matrix(rnorm(3 * 4), 3, 4)
# F(A) = A B
# DF(A) = B' x I
stopifnot(all.equal(
num.deriv(function(X) X %*% B, A),
# DF(A) = B' %x% I
exprs.all.equal({
t(B) %x% diag(nrow(A))
))
num.deriv(function(X) X %*% B, A)
num.deriv(A %*% B, A)
num.deriv(X %*% B, X = A)
})
# Symmetric case, constraint A = A' (equiv to being a function of vech(A) only)
# F(A) = A B for A = A'
# DF(A) = B' x I
# DF(A) = B' %x% I
stopifnot(all.equal(
num.deriv(function(X) X %*% B, A, sym = TRUE),
(t(B) %x% diag(nrow(A))) %*% D(nrow(A)) %*% t(D(nrow(A)))

View File

@ -0,0 +1,4 @@
library(testthat)
library(tensorPredictors)
test_check("tensorPredictors")
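The suite can also be run outside `R CMD check`, for example (assuming the standard testthat layout implied by `test_check`):
devtools::test() # from the package source directory
testthat::test_dir("tests/testthat") # or directly via testthat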

View File

@ -0,0 +1,50 @@
# F(X) = log(det(X)) if det(X) > 0
# dF(X) = tr(X^-1 dX)
# DF(X) = vec((X^-1)')'
test_that("Matrix Calculus 1", {
# test data
X <- tcrossprod(diag(5) + matrix(runif(5^2, -.1, .1), 5))
num.grad <- num.deriv(function(X) log(det(X)), X)
ana.grad <- c(solve(X))
expect_equal(num.grad, ana.grad)
})
# F(mu) = <X - mu, (X - mu) x_{k in [r]} Delta_k^-1> for Delta_k = Delta_k'
# DF(mu) = -2 vec((X - mu) x_{k in [r]} Delta_k^-1)'
test_that("Matrix Calculus 2", {
p <- c(2, 3, 5)
X <- array(rnorm(prod(p)), dim = p)
mu <- array(rnorm(prod(p)), dim = p)
Deltas <- Map(function(pj) tcrossprod(diag(pj) + runif(pj^2, -0.1, 0.1)), p)
ana.grad <- -2 * c(mlm(X - mu, Map(solve, Deltas)))
num.grad <- num.deriv(function(mu) sum((X - mu) * mlm(X - mu, Map(solve, Deltas))), mu)
expect_equal(num.grad, ana.grad)
})
# F(Delta_j) = <X - mu, (X - mu) x_{k in [r]} Delta_k^-1> for Delta_k = Delta_k'
# DF(Delta_j) = -((X - mu) x_{k in [r]} Delta_k^-1)_(j) (X - mu)_(j)' Delta_j^-1
test_that("Matrix Calculus 3", {
# config
p <- c(2, 3, 5)
# generate test data
X <- array(rnorm(prod(p)), dim = p)
mu <- array(rnorm(prod(p)), dim = p)
Deltas <- Map(function(pj) tcrossprod(diag(pj) + runif(pj^2, -0.1, 0.1)), p)
# check analytic to numeric derivatives
for (j in seq_along(Deltas)) {
num.grad <- matrix(num.deriv(function(Delta_j) {
Deltas[[j]] <- Delta_j
sum((X - mu) * mlm(X - mu, Map(solve, Deltas)))
}, Deltas[[j]]), p[j])
ana.grad <- -mcrossprod(mlm(X - mu, Map(solve, Deltas)), X - mu, j) %*% solve(Deltas[[j]])
expect_equal(num.grad, ana.grad)
}
})

View File

@ -0,0 +1,186 @@
test_that("Tensor-Normal Score", {
# setup dimensions
n <- 11L
p <- c(3L, 5L, 4L)
q <- c(2L, 7L, 3L)
# create "true" GLM parameters
eta1 <- array(rnorm(prod(p)), dim = p)
alphas <- Map(matrix, Map(rnorm, p * q), p)
Omegas <- Map(function(pj) {
solve(0.5^abs(outer(seq_len(pj), seq_len(pj), `-`)))
}, p)
params <- list(eta1, alphas, Omegas)
# compute tensor normal parameters from GLM parameters
Deltas <- Map(solve, Omegas)
mu <- mlm(eta1, Deltas)
# sample some test data
sample.axis <- length(p) + 1L
Fy <- array(rnorm(n * prod(q)), dim = c(q, n))
X <- mlm(Fy, Map(`%*%`, Deltas, alphas)) + rtensornorm(n, mu, Deltas, sample.axis)
# Create a GLM family object
family <- make.gmlm.family("normal")
# first unpack the family object
log.likelihood <- family$log.likelihood
grad <- family$grad
# compare numeric gradient against the analytic gradient provided by the
# family at the initial parameters
grad.num <- do.call(function(eta1, alphas, Omegas) {
list(
"Dl(eta1)" = num.deriv.function(function(eta1) {
log.likelihood(X, Fy, eta1, alphas, Omegas)
}, eta1),
"Dl(alphas)" = Map(function(j) num.deriv.function(function(alpha_j) {
alphas[[j]] <- alpha_j
log.likelihood(X, Fy, eta1, alphas, Omegas)
}, alphas[[j]]), seq_along(alphas)),
"Dl(Omegas)" = Map(function(j) num.deriv.function(function(Omega_j) {
Omegas[[j]] <- Omega_j
log.likelihood(X, Fy, eta1, alphas, Omegas)
}, Omegas[[j]], sym = TRUE), seq_along(Omegas))
)
}, params)
grad.ana <- do.call(grad, c(list(X, Fy), params))
expect_equal(RMap(c, grad.ana), grad.num, tolerance = 1e-6)
# test for correct dimensions
ana.dims <- list(
"Dl(eta1)" = p,
"Dl(alphas)" = Map(c, p, q),
"Dl(Omegas)" = Map(c, p, p)
)
expect_identical(RMap(dim, grad.ana), ana.dims)
})
test_that("Tensor-Normal moments of the sufficient statistic t(X)", {
# config
# # (estimated) sample size for sample estimates
# n <- 250000 # sample estimates are too unreliable for testing purposes
p <- c(2L, 3L)
r <- length(p)
# setup tensor normal parameters (NO dependence of Fy, set to zero)
mu <- array(runif(prod(p), min = -0.5, max = 0.5), dim = p) # = mu_y
Deltas <- Map(function(pj) 0.5^abs(outer(1:pj, 1:pj, `-`)), p)
# compute GMLM parameters (NO alphas as Fy is zero)
Omegas <- Map(solve, Deltas)
eta1 <- mlm(mu, Omegas)
# natural parameters of the tensor normal
eta_y1 <- eta1 # + mlm(Fy, alphas) = + 0
eta_y2 <- -0.5 * Reduce(`%x%`, rev(Omegas))
# # draw a sample (only needed for the sample estimates, which are commented out below)
# X <- rtensornorm(n, mu, Deltas, r + 1L)
# tensor normal log-partition function given eta_y
log.partition <- function(eta_y1, eta_y2) {
# eta_y1 as "model" matrix of dimensions `n by p` where `n` might be `1`
if (length(eta_y1)^2 == length(eta_y2)) {
pp <- length(eta_y1)
dim(eta_y1) <- c(1L, pp)
} else {
eta_y1 <- mat(eta_y1, r + 1L)
pp <- ncol(eta_y1)
}
# treat eta_y2 as square matrix
if (!is.matrix(eta_y2)) {
dim(eta_y2) <- c(pp, pp)
}
# evaluate log-partition function in terms of natural parameters
-0.25 * pp * mean((eta_y1 %*% solve(eta_y2)) * eta_y1) - 0.5 * log(det(-2 * eta_y2))
}
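# Reference for the formula above: writing the density kernel as
# exp(t(eta_y1) %*% vec(x) + t(vec(x)) %*% eta_y2 %*% vec(x))
# and completing the square gives, up to an additive constant,
# b(eta_y) = -1/4 t(eta_y1) %*% solve(eta_y2) %*% eta_y1 - 1/2 log(det(-2 * eta_y2)),
# which `log.partition` evaluates, averaged over the observations (rows of eta_y1).
# The dropped constant is irrelevant because only derivatives of b are compared below.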
# (analytic) first and second (un-centered) moment of the tensor normal
# Eti = E[ti(X) | Y = y]
Et1.ana <- mu
Et2.ana <- Reduce(`%x%`, rev(Deltas)) + outer(c(mu), c(mu))
# (numeric) derivative of the log-partition function with respect to the natural
# parameters of the exponential family form of the tensor normal
Et1.num <- num.deriv(log.partition(eta_y1, eta_y2), eta_y1)
Et2.num <- num.deriv(log.partition(eta_y1, eta_y2), eta_y2)
# # (estimated) estimate from sample
# Et1.est <- rowMeans(X, dims = r)
# Et2.est <- colMeans(rowKronecker(mat(X, r + 1L), mat(X, r + 1L)))
# (analytic) covariance blocks of the sufficient statistic
# Ctij = Cov(ti(X), tj(X) | Y = y)
Ct11.ana <- Reduce(`%x%`, rev(Deltas))
Ct12.ana <- local({
# Analytic solution / `vec(mu %x% Delta) + vec(mu' %x% Delta)`
ct12 <- 2 * c(mu) %x% Reduce(`%x%`, rev(Deltas))
# and symmetrize!
dim(ct12) <- rep(prod(p), 3)
(1 / 2) * (ct12 + aperm(ct12, c(3, 1, 2)))
})
Ct22.ana <- local({
Delta <- Reduce(`%x%`, rev(Deltas)) # Ct11.ana
ct22 <- (2 * Delta + 4 * outer(c(mu), c(mu))) %x% Delta
# TODO: What does this symmetrization do exactly? And why?!
dim(ct22) <- rep(prod(p), 4)
(1 / 4) * (
aperm(ct22, c(2, 3, 1, 4)) +
aperm(ct22, c(2, 3, 4, 1)) +
aperm(ct22, c(3, 2, 1, 4)) +
aperm(ct22, c(3, 2, 4, 1))
)
})
# (numeric) second derivative of the log-partition function
Ct11.num <- num.deriv2(function(eta_y1) log.partition(eta_y1, eta_y2), eta_y1)
Ct12.num <- num.deriv2(log.partition, eta_y1, eta_y2)
Ct22.num <- local({
ct22 <- num.deriv2(function(eta_y2) log.partition(eta_y1, eta_y2), eta_y2)
# TODO: Why does this need to be symmetrized?!
dim(ct22) <- rep(prod(p), 4)
(1 / 4) * (
aperm(ct22, c(3, 4, 2, 1)) +
aperm(ct22, c(4, 3, 2, 1)) +
aperm(ct22, c(3, 4, 1, 2)) +
aperm(ct22, c(4, 3, 1, 2))
)
})
# # (estimated) sample estimates of the sufficient statistic covariance blocks
# T1 <- mat(X, r + 1L)
# T2 <- rowKronecker(T1, T1)
# Ct11.est <- cov(T1)
# Ct12.est <- cov(T1, T2)
# Ct22.est <- cov(T2, T2)
tolerance <- 1e-3
expect_equal(c(Et1.ana), c(Et1.num), tolerance = tolerance)
# expect_equal(c(Et1.ana), c(Et1.est), tolerance = 0.05, scale = 1)
# expect_equal(c(Et1.num), c(Et1.est), tolerance = 0.05, scale = 1)
expect_equal(c(Et2.ana), c(Et2.num), tolerance = tolerance)
# expect_equal(c(Et2.ana), c(Et2.est), tolerance = 0.05, scale = 1)
# expect_equal(c(Et2.num), c(Et2.est), tolerance = 0.05, scale = 1)
expect_equal(c(Ct11.ana), c(Ct11.num), tolerance = tolerance)
# expect_equal(c(Ct11.ana), c(Ct11.est), tolerance = 0.05, scale = 1)
# expect_equal(c(Ct11.num), c(Ct11.est), tolerance = 0.05, scale = 1)
expect_equal(c(Ct12.ana), c(Ct12.num), tolerance = tolerance)
# expect_equal(c(Ct12.ana), c(Ct12.est), tolerance = 0.05, scale = 1)
# expect_equal(c(Ct12.num), c(Ct12.est), tolerance = 0.05, scale = 1)
expect_equal(c(Ct22.ana), c(Ct22.num), tolerance = tolerance)
# expect_equal(c(Ct22.ana), c(Ct22.est), tolerance = 0.05, scale = 1)
# expect_equal(c(Ct22.num), c(Ct22.est), tolerance = 0.05, scale = 1)
})