fix: some typos in GMLM.tex,
add: extended interface of num.deriv and matrixImage, add: "tests", add: moved make.gmlm.family into separate file
This commit is contained in: parent 79f16c58dc, commit 2dfdc7083b
|
@@ -139,11 +139,12 @@
|
|||
|
||||
\maketitle
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\begin{abstract}
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
We propose a \textit{novel formulation} of a method for sufficient dimension reduction of Tensor-valued predictors (multi dimensional arrays) for regression or classification. We assume the distribution of the Tensor-valued predictors given the response is in the Quadratic Exponential Family and model the natural parameter for a Generalized Linear Model in an inverse regression setting where the relation via a link is of multi-linear nature.
|
||||
Using a multi-linear relation allows to perform per-axis reductions which reduce the total number of parameters drastically for higher order Tensor-valued predictors. Under the Exponential Family we derive maximum likelihood estimates of the multi-linear sufficient dimension reduction of the Tensor-valued predictors. Furthermore, we provide an estimation algorithm which utilizes the Tensor structure allowing efficient implementations. The performance of the method is illustrated via simulations and real world examples are provided.
|
||||
We consider regression and classification for \textit{general} response and tensor-valued predictors (multi dimensional arrays) and propose a \textit{novel formulation} for sufficient dimension reduction. Assuming the distribution of the tensor-valued predictors given the response is in the quadratic exponential family, we model the natural parameter as a multi-linear function of the response.
|
||||
This allows per-axis reductions that drastically reduce the total number of parameters for higher order tensor-valued predictors. We derive maximum likelihood estimates for the sufficient dimension reduction and a computationally efficient estimation algorithm which leverages the tensor structure. The performance of the method is illustrated via simulations and real world examples are provided.
|
||||
\end{abstract}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
@@ -523,13 +524,13 @@ The next step is to identify the Hessians from the second differentials in a sim
|
|||
&\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \vec\biggl( (\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j) \Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))} \biggr) \\
|
||||
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big)_{(j)}\otimes\mat{I}_{p_j} \biggr) \mat{K}_{\mat{p},(j)}\mat{H}_{1,1}\t{\mat{K}_{\mat{p},(l)}} \biggl( \t{\Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big)_{(l)}}\otimes\mat{I}_{p_l} \biggr)\vec{\d\mat{\alpha}_l} \\
|
||||
&\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \biggl( \t{\Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))}}\otimes\mat{I}_{p_j p_l} \biggr) \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\
|
||||
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \vec{\d\mat{\alpha}_l} \\
|
||||
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \vec{\d\mat{\alpha}_l} \\
|
||||
&\qquad + c_1 \vec\biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr) \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\
|
||||
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \vec{\d\mat{\alpha}_l} \\
|
||||
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \vec{\d\mat{\alpha}_l} \\
|
||||
&\qquad + c_1 \t{(\vec{\d\mat{\alpha}_j})} \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3))} \vec{\d\mat{\alpha}_l} \\
|
||||
&\qquad \begin{aligned}
|
||||
\Rightarrow \frac{\partial^2 l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\mat{\alpha}_l})}} &=
|
||||
-c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \\
|
||||
-c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \\
|
||||
&\qquad + c_1 \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3) + [[j > l]])}
|
||||
\qquad{\color{gray} (p_j q_j \times p_l q_l)}
|
||||
\end{aligned}
|
||||
|
@@ -591,22 +592,23 @@ and with that all auxiliary calculations are done. Using the two expectations yie
|
|||
\mathcal{I}_{2 r + 1,1} & \mathcal{I}_{2 r + 1,2} & \cdots & \mathcal{I}_{2 r + 1, 2 r + 1}
|
||||
\end{pmatrix}
|
||||
\end{displaymath}
|
||||
for $j, l = 1, ..., r$ as follows
|
||||
and for every block holds $\mathcal{I}_{j, l} = \t{\mathcal{I}_{l, j}}$. The individual blocks are given with $j = 1, ..., r$ and $j \leq l \leq r$ by
|
||||
\begin{align*}
|
||||
\mathcal{I}_{1,1} &= c_1^2 (\ten{H}_{1,1})_{([r])} \\
|
||||
\mathcal{I}_{1,j+1} = \t{\mathcal{I}_{j+1,1}}
|
||||
\mathcal{I}_{1,j+1} % = \E\partial_{\vec{\overline{\ten{\eta}}_1}}\partial_{\t{(\vec{\mat{\alpha}_j})}} l(\mat{\Theta})\mid \ten{Y} = y
|
||||
&= c_1^2 \Big[\Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1}\Big]_{((2, 1))} \\
|
||||
\mathcal{I}_{1,j+1} = \t{\mathcal{I}_{j+1,1}}
|
||||
\mathcal{I}_{1,j+r+1}
|
||||
&= c_1 c_2 \Big( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)} \\
|
||||
\mathcal{I}_{j+1,l+1} = \t{\mathcal{I}_{l+1,j+1}}
|
||||
\mathcal{I}_{j+1,l+1}
|
||||
&= c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \\
|
||||
\mathcal{I}_{j+1,l+r+1} = \t{\mathcal{I}_{l+r+1,j+1}}
|
||||
\mathcal{I}_{j+1,l+r+1}
|
||||
&= c_1 c_2 \biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{((r + 2, 1))} \\
|
||||
\mathcal{I}_{j+r+1,l+r+1} = \t{\mathcal{I}_{l+r+1,j+r+1}}
|
||||
\mathcal{I}_{j+r+1,l+r+1}
|
||||
&= c_2^2 \Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}
|
||||
\end{align*}
|
||||
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{Vectorization and Matricization}
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
@@ -615,7 +617,7 @@ The \emph{matricization} is a generalization of the \emph{vectorization} operati
|
|||
\begin{theorem}\label{thm:mlm_mat}
|
||||
Let $\ten{A}$ be a tensor of order $r$ with dimensions $q_1\times ... \times q_r$. Furthermore, for $k = 1, ..., r$ let $\mat{B}_k$ be matrices of dimensions $p_k\times q_k$. Then, for any $(\mat{i}, \mat{j})\in\perm{r}$ holds
|
||||
\begin{displaymath}
|
||||
(\ten{A}\times_{k\in[r]}\mat{B}_k)_{(\mat{i}, \mat{j})}
|
||||
\Big(\ten{A}\mlm{k\in[r]}\mat{B}_k\Big)_{(\mat{i}, \mat{j})}
|
||||
= \Big(\bigotimes_{k = \len{\mat{i}}}^{1}\mat{B}_{\mat{i}_k}\Big) \ten{A}_{(\mat{i}, \mat{j})} \Big(\bigotimes_{k = \len{\mat{j}}}^{1}\t{\mat{B}_{\mat{j}_k}}\Big).
|
||||
\end{displaymath}
|
||||
\end{theorem}
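For example, with $r = 2$, $\mat{i} = (1, 2)$ and $\mat{j} = ()$, the $(\mat{i}, \mat{j})$ matricization is the vectorization of the multi-linear product, and Theorem~\ref{thm:mlm_mat} reduces to the well known identity
\begin{displaymath}
\vec(\mat{B}_1\mat{A}\t{\mat{B}_2}) = (\mat{B}_2\otimes\mat{B}_1)\vec{\mat{A}}.
\end{displaymath}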
|
||||
|
|
|
@@ -47,6 +47,7 @@ export(mkm)
|
|||
export(mlm)
|
||||
export(mtvk)
|
||||
export(num.deriv)
|
||||
export(num.deriv.function)
|
||||
export(num.deriv2)
|
||||
export(reduce)
|
||||
export(rowKronecker)
|
||||
|
|
|
@@ -5,265 +5,6 @@ GMLM <- function(...) {
|
|||
stop("Not Implemented")
|
||||
}
|
||||
|
||||
|
||||
make.gmlm.family <- function(name) {
|
||||
# standardize family name
|
||||
name <- list(
|
||||
normal = "normal", gaussian = "normal",
|
||||
bernoulli = "bernoulli", ising = "bernoulli"
|
||||
)[[tolower(name), exact = FALSE]]
|
||||
|
||||
############################################################################
|
||||
# #
|
||||
# TODO: better (and possibly specialized) initial parameters!?!?!?! #
|
||||
# #
|
||||
############################################################################
|
||||
|
||||
switch(name,
|
||||
normal = {
|
||||
initialize <- function(X, Fy) {
|
||||
p <- head(dim(X), -1)
|
||||
r <- length(p)
|
||||
|
||||
# Mode-Covariances
|
||||
XSigmas <- mcov(X, sample.axis = r + 1L)
|
||||
YSigmas <- mcov(Fy, sample.axis = r + 1L)
|
||||
|
||||
# Extract main mode covariance directions
|
||||
# Note: (the directions are transposed!)
|
||||
XDirs <- Map(function(Sigma) {
|
||||
SVD <- La.svd(Sigma, nu = 0)
|
||||
sqrt(SVD$d) * SVD$vt
|
||||
}, XSigmas)
|
||||
YDirs <- Map(function(Sigma) {
|
||||
SVD <- La.svd(Sigma, nu = 0)
|
||||
sqrt(SVD$d) * SVD$vt
|
||||
}, YSigmas)
|
||||
|
||||
alphas <- Map(function(xdir, ydir) {
|
||||
s <- min(ncol(xdir), nrow(ydir))
|
||||
crossprod(xdir[seq_len(s), , drop = FALSE],
|
||||
ydir[seq_len(s), , drop = FALSE])
|
||||
}, XDirs, YDirs)
|
||||
|
||||
list(
|
||||
eta1 = rowMeans(X, dims = r),
|
||||
alphas = alphas,
|
||||
Omegas = Map(diag, p)
|
||||
)
|
||||
}
|
||||
|
||||
# parameters of the tensor normal computed from the GLM parameters
|
||||
params <- function(Fy, eta1, alphas, Omegas) {
|
||||
Deltas <- Map(solve, Omegas)
|
||||
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
|
||||
list(mu_y = mu_y, Deltas = Deltas)
|
||||
}
|
||||
|
||||
# scaled negative log-likelihood
|
||||
log.likelihood <- function(X, Fy, eta1, alphas, Omegas) {
|
||||
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions (used below via `prod(p)`)
|
||||
|
||||
# conditional mean
|
||||
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas))
|
||||
|
||||
# negative log-likelihood scaled by sample size
|
||||
# Note: the `suppressWarnings` is because `log(mapply(det, Omegas))`
|
||||
# might fail, but `NAGD` has failsafes against cases of "illegal"
|
||||
# parameters.
|
||||
suppressWarnings(
|
||||
0.5 * prod(p) * log(2 * pi) +
|
||||
sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) -
|
||||
(0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p)
|
||||
)
|
||||
}
|
||||
# gradient of the scaled negative log-likelihood
|
||||
grad <- function(X, Fy, eta1, alphas, Omegas) {
|
||||
# retrieve dimensions
|
||||
n <- tail(dim(X), 1) # sample size
|
||||
p <- head(dim(X), -1) # predictor dimensions
|
||||
r <- length(p) # single predictor/response tensor order
|
||||
|
||||
## "Inverse" Link: Tensor Normal Specific
|
||||
# known exponential family constants
|
||||
c1 <- 1
|
||||
c2 <- -0.5
|
||||
|
||||
# Covariances from the GLM parameter Scatter matrices
|
||||
Deltas <- Map(solve, Omegas)
|
||||
|
||||
# First moment via "inverse" link `g1(eta_y) = E[X | Y = y]`
|
||||
E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
|
||||
|
||||
# Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]`
|
||||
dim(E1) <- c(prod(p), n)
|
||||
E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1))
|
||||
## end "Inverse" Link
|
||||
|
||||
dim(X) <- c(prod(p), n)
|
||||
|
||||
# Residuals
|
||||
R <- X - E1
|
||||
dim(R) <- c(p, n)
|
||||
|
||||
# mean deviation between the sample covariance to GLM estimated covariance
|
||||
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
|
||||
S <- rowMeans(colKronecker(X, X)) - E2 # <- Optimized for Tensor Normal
|
||||
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
|
||||
|
||||
# Gradients of the negative log-likelihood scaled by sample size
|
||||
list(
|
||||
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
|
||||
"Dl(alphas)" = Map(function(j) {
|
||||
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
|
||||
}, 1:r),
|
||||
"Dl(Omegas)" = Map(function(j) {
|
||||
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
|
||||
# adapt to the symmetry constraint for the derivative
|
||||
dim(deriv) <- c(p[j], p[j])
|
||||
deriv + t(deriv * (1 - diag(p[j])))
|
||||
}, 1:r)
|
||||
)
|
||||
}
|
||||
},
|
||||
bernoulli = {
|
||||
require(mvbernoulli)
|
||||
|
||||
initialize <- function(X, Fy) {
|
||||
p <- head(dim(X), -1)
|
||||
r <- length(p)
|
||||
|
||||
# Mode-Covariances
|
||||
XSigmas <- mcov(X, sample.axis = r + 1L)
|
||||
YSigmas <- mcov(Fy, sample.axis = r + 1L)
|
||||
|
||||
# Extract main mode covariance directions
|
||||
# Note: (the directions are transposed!)
|
||||
XDirs <- Map(function(Sigma) {
|
||||
SVD <- La.svd(Sigma, nu = 0)
|
||||
sqrt(SVD$d) * SVD$vt
|
||||
}, XSigmas)
|
||||
YDirs <- Map(function(Sigma) {
|
||||
SVD <- La.svd(Sigma, nu = 0)
|
||||
sqrt(SVD$d) * SVD$vt
|
||||
}, YSigmas)
|
||||
|
||||
alphas <- Map(function(xdir, ydir) {
|
||||
s <- min(ncol(xdir), nrow(ydir))
|
||||
crossprod(xdir[seq_len(s), , drop = FALSE],
|
||||
ydir[seq_len(s), , drop = FALSE])
|
||||
}, XDirs, YDirs)
|
||||
|
||||
list(
|
||||
eta1 = array(0, dim = p),
|
||||
alphas = alphas,
|
||||
Omegas = Map(diag, p)
|
||||
)
|
||||
}
|
||||
|
||||
params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
|
||||
# number of observations
|
||||
n <- tail(dim(Fy), 1)
|
||||
|
||||
# natural exponential family parameters
|
||||
eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1))
|
||||
eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas))
|
||||
|
||||
# # next the conditional Ising model parameters `theta_y`
|
||||
# theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n)
|
||||
# dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n)
|
||||
|
||||
# ltri <- which(lower.tri(eta_y2, diag = TRUE))
|
||||
# diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
|
||||
# theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1)
|
||||
# theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ]
|
||||
|
||||
# conditional Ising model parameters
|
||||
theta_y <- matrix(rep(vech(eta_y2), n), ncol = n)
|
||||
ltri <- which(lower.tri(eta_y2, diag = TRUE))
|
||||
diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
|
||||
theta_y[diagonal, ] <- eta_y1
|
||||
|
||||
theta_y
|
||||
}
|
||||
|
||||
# Scaled negative log-likelihood
|
||||
log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
|
||||
# number of observations
|
||||
n <- tail(dim(X), 1L)
|
||||
|
||||
# conditional Ising model parameters
|
||||
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
|
||||
|
||||
# convert to binary data set
|
||||
storage.mode(X) <- "integer"
|
||||
X.mvb <- as.mvbinary(mat(X, length(dim(X))))
|
||||
|
||||
# log-likelihood of the data set
|
||||
-mean(sapply(seq_len(n), function(i) {
|
||||
ising_log_likelihood(theta_y[, i], X.mvb[i])
|
||||
}))
|
||||
}
|
||||
|
||||
# Gradient of the scaled negative log-likelihood
|
||||
grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
|
||||
# retrieve dimensions
|
||||
n <- tail(dim(X), 1) # sample size
|
||||
p <- head(dim(X), -1) # predictor dimensions
|
||||
r <- length(p) # single predictor/response tensor order
|
||||
|
||||
## "Inverse" Link: Ising Model Specific
|
||||
# conditional Ising model parameters: `p (p + 1) / 2` by `n`
|
||||
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
|
||||
|
||||
# conditional expectations
|
||||
# ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y]
|
||||
E2 <- apply(theta_y, 2L, ising_marginal_probs)
|
||||
# convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y]
|
||||
E2 <- E2[vech.pinv.index(prod(p)), ]
|
||||
# extract diagonal elements which are equal to E[vec(X) | Y = y]
|
||||
E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ]
|
||||
## end "Inverse" Link
|
||||
|
||||
dim(X) <- c(prod(p), n)
|
||||
|
||||
# Residuals
|
||||
R <- X - E1
|
||||
dim(R) <- c(p, n)
|
||||
|
||||
# mean deviation between the sample covariance to GLM estimated covariance
|
||||
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
|
||||
S <- rowMeans(colKronecker(X, X) - E2)
|
||||
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
|
||||
|
||||
# Gradients of the negative log-likelihood scaled by sample size
|
||||
list(
|
||||
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
|
||||
"Dl(alphas)" = Map(function(j) {
|
||||
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
|
||||
}, 1:r),
|
||||
"Dl(Omegas)" = Map(function(j) {
|
||||
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
|
||||
# adapt to the symmetry constraint for the derivative
|
||||
dim(deriv) <- c(p[j], p[j])
|
||||
deriv + t(deriv * (1 - diag(p[j])))
|
||||
}, 1:r)
|
||||
)
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
list(
|
||||
family = name,
|
||||
initialize = initialize,
|
||||
params = params,
|
||||
# linkinv = linkinv,
|
||||
log.likelihood = log.likelihood,
|
||||
grad = grad
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
#' @export
|
||||
GMLM.default <- function(X, Fy, sample.axis = 1L,
|
||||
family = "normal",
|
||||
|
@@ -292,6 +33,7 @@ GMLM.default <- function(X, Fy, sample.axis = 1L,
|
|||
}
|
||||
}
|
||||
|
||||
# optimize likelihood using Nesterov Accelerated Gradient Descent
|
||||
params.fit <- NAGD(
|
||||
fun.loss = function(params) {
|
||||
# scaled negative log-likelihood
|
||||
|
|
|
@@ -24,7 +24,7 @@
|
|||
#' matrix(1, 2, 3)
|
||||
#' array(rep(1, 6), dim = c(2, 3))
|
||||
#' })
|
||||
#' # basicaly identical to
|
||||
#' # is identical to
|
||||
#' stopifnot(exprs = {
|
||||
#' all.equal(matrix(rep(1, 6), 2, 3), matrix(1, 2, 3))
|
||||
#' all.equal(matrix(rep(1, 6), 2, 3), array(rep(1, 6), dim = c(2, 3)))
|
||||
|
|
|
@@ -0,0 +1,386 @@
|
|||
|
||||
make.gmlm.family <- function(name) {
|
||||
# standardize family name
|
||||
name <- list(
|
||||
normal = "normal", gaussian = "normal",
|
||||
bernoulli = "bernoulli", ising = "bernoulli"
|
||||
)[[tolower(name), exact = FALSE]]
|
||||
|
||||
############################################################################
|
||||
# #
|
||||
# TODO: better (and possibly specialized) initial parameters!?!?!?! #
|
||||
# #
|
||||
############################################################################
|
||||
|
||||
switch(name,
|
||||
########################################################################
|
||||
### Tensor Normal ###
|
||||
########################################################################
|
||||
normal = {
|
||||
initialize <- function(X, Fy) {
|
||||
p <- head(dim(X), -1)
|
||||
r <- length(p)
|
||||
|
||||
# Mode-Covariances
|
||||
XSigmas <- mcov(X, sample.axis = r + 1L)
|
||||
YSigmas <- mcov(Fy, sample.axis = r + 1L)
|
||||
|
||||
# Extract main mode covariance directions
|
||||
# Note: (the directions are transposed!)
|
||||
XDirs <- Map(function(Sigma) {
|
||||
SVD <- La.svd(Sigma, nu = 0)
|
||||
sqrt(SVD$d) * SVD$vt
|
||||
}, XSigmas)
|
||||
YDirs <- Map(function(Sigma) {
|
||||
SVD <- La.svd(Sigma, nu = 0)
|
||||
sqrt(SVD$d) * SVD$vt
|
||||
}, YSigmas)
|
||||
|
||||
alphas <- Map(function(xdir, ydir) {
|
||||
s <- min(ncol(xdir), nrow(ydir))
|
||||
crossprod(xdir[seq_len(s), , drop = FALSE],
|
||||
ydir[seq_len(s), , drop = FALSE])
|
||||
}, XDirs, YDirs)
|
||||
|
||||
list(
|
||||
eta1 = rowMeans(X, dims = r),
|
||||
alphas = alphas,
|
||||
Omegas = Map(diag, p)
|
||||
)
|
||||
}
|
||||
|
||||
# parameters of the tensor normal computed from the GLM parameters
|
||||
params <- function(Fy, eta1, alphas, Omegas) {
|
||||
Deltas <- Map(solve, Omegas)
|
||||
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
|
||||
list(mu_y = mu_y, Deltas = Deltas)
|
||||
}
|
||||
|
||||
# scaled negative log-likelihood
|
||||
log.likelihood <- function(X, Fy, eta1, alphas, Omegas) {
|
||||
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions (used below via `prod(p)`)
|
||||
|
||||
# conditional mean
|
||||
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas))
|
||||
|
||||
# negative log-likelihood scaled by sample size
|
||||
# Note: the `suppressWarnings` is because `log(mapply(det, Omegas))`
|
||||
# might fail, but `NAGD` has failsafes against cases of "illegal"
|
||||
# parameters.
|
||||
suppressWarnings(
|
||||
0.5 * prod(p) * log(2 * pi) +
|
||||
sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) -
|
||||
(0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p)
|
||||
)
|
||||
}
|
||||
|
||||
# gradient of the scaled negative log-likelihood
|
||||
grad <- function(X, Fy, eta1, alphas, Omegas) {
|
||||
# retrieve dimensions
|
||||
n <- tail(dim(X), 1) # sample size
|
||||
p <- head(dim(X), -1) # predictor dimensions
|
||||
r <- length(p) # single predictor/response tensor order
|
||||
|
||||
## "Inverse" Link: Tensor Normal Specific
|
||||
# known exponential family constants
|
||||
c1 <- 1
|
||||
c2 <- -0.5
|
||||
|
||||
# Covariances from the GLM parameter Scatter matrices
|
||||
Deltas <- Map(solve, Omegas)
|
||||
|
||||
# First moment via "inverse" link `g1(eta_y) = E[X | Y = y]`
|
||||
E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
|
||||
|
||||
# Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]`
|
||||
dim(E1) <- c(prod(p), n)
|
||||
E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1))
|
||||
## end "Inverse" Link
|
||||
|
||||
dim(X) <- c(prod(p), n)
|
||||
|
||||
# Residuals
|
||||
R <- X - E1
|
||||
dim(R) <- c(p, n)
|
||||
|
||||
# mean deviation between the sample covariance to GLM estimated covariance
|
||||
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
|
||||
S <- rowMeans(colKronecker(X, X)) - E2 # <- Optimized for Tensor Normal
|
||||
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
|
||||
|
||||
# Gradients of the negative log-likelihood scaled by sample size
|
||||
list(
|
||||
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
|
||||
"Dl(alphas)" = Map(function(j) {
|
||||
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
|
||||
}, 1:r),
|
||||
"Dl(Omegas)" = Map(function(j) {
|
||||
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
|
||||
# adapt to the symmetry constraint for the derivative
|
||||
dim(deriv) <- c(p[j], p[j])
|
||||
deriv + t(deriv * (1 - diag(p[j])))
|
||||
}, 1:r)
|
||||
)
|
||||
}
|
||||
|
||||
# mean conditional Fisher Information
|
||||
fisher.info <- function(Fy, eta1, alphas, Omegas) {
|
||||
# retrieve dimensions
|
||||
n <- tail(dim(Fy), 1) # sample size
|
||||
p <- dim(eta1) # predictor dimensions
|
||||
q <- head(dim(Fy), -1) # response dimensions
|
||||
r <- length(p) # single predictor/response tensor order
|
||||
|
||||
# Hij = Cov(ti(X) %x% tj(X) | Y = y), i, j = 1, 2
|
||||
H11 <- Reduce(`%x%`, rev(Map(solve, Omegas))) # covariance
|
||||
# 3rd central moment is zero
|
||||
H12 <- H21 <- 0
|
||||
# 4th moment by "Isserlis' theorem" a.k.a. "Wick's theorem"
|
||||
H22 <- kronecker(H11, H11)
|
||||
dim(H22) <- rep(prod(p), 4)
|
||||
H22 <- H22 + aperm(H22, c(1, 3, 2, 4)) + aperm(H22, c(2, 1, 3, 4)) # three distinct pairings
|
||||
|
||||
dim(H11) <- c(p, p)
|
||||
dim(H22) <- c(p, p, p, p)
|
||||
|
||||
## Fisher Information: Tensor Normal Specific
|
||||
# known exponential family constants
|
||||
c1 <- 1
|
||||
c2 <- -0.5
|
||||
|
||||
# list of (Fy x_{k in [r]\j} alpha_j)
|
||||
Gys <- Map(function(j) {
|
||||
mlm(Fy, alphas[-j], (1:r)[-j])
|
||||
}, 1:r)
|
||||
|
||||
# setup indices of lower triangular matrix elements used for
|
||||
# all tuples (j, l) with 1 <= j <= l <= r.
|
||||
ltri <- lower.tri(diag(r), TRUE)
|
||||
J <- .col(c(r, r))[ltri]
|
||||
L <- .row(c(r, r))[ltri]
|
||||
|
||||
# inverse perfect outer shuffle of `2 r` elements
|
||||
iShuf <- rep(1:r, each = 2) + c(0, r)
|
||||
|
||||
# Getter for the i'th observation from Gys[[j]]
|
||||
# TODO: this is ugly, find a better (but still dynamic) version
|
||||
GyGet <- function(i, j) {
|
||||
obs <- eval(str2lang(paste(
|
||||
"Gys[[j]][",
|
||||
paste(rep(",", r), collapse = ""),
|
||||
"i, drop = FALSE]"
|
||||
, collapse = "")))
|
||||
dim(obs) <- head(dim(obs), -1)
|
||||
obs
|
||||
}
|
||||
|
||||
I11 <- c1^2 * mat(H11, 1:r)
|
||||
I12 <- Map(function(j) {
|
||||
i11 <- ttt(Gys[[j]], H11, (1:r)[-j])
|
||||
# take mean with respect to observations (second mode)
|
||||
i11 <- colMeans(mat(i11, 2))
|
||||
dim(i11) <- c(q[j], p[j], p)
|
||||
t(mat(i11, 2:1))
|
||||
}, 1:r)
|
||||
I13 <- Map(matrix, 0, prod(p), p^2)
|
||||
I22 <- Map(function(j, l) {
|
||||
i22 <- 0
|
||||
for (i in 1:n) {
|
||||
i22 <- i22 + ttt(
|
||||
ttt(GyGet(i, j), H11, (1:r)[-j]),
|
||||
GyGet(i, l), (1:r)[-l] + 2, (1:r)[-l])
|
||||
}
|
||||
(c1^2 / n) * mat(i22, 2:1)
|
||||
}, J, L)
|
||||
I23 <- Map(matrix, 0, (p * q)[J], (p^2)[L])
|
||||
|
||||
# shuffled H22
|
||||
sH22 <- c2^2 * aperm(H22, c(iShuf, iShuf + 2 * r))
|
||||
dim(sH22) <- c(p^2, p^2)
|
||||
I33 <- Map(function(j, l) {
|
||||
# second derivative (without constraints)
|
||||
deriv <- mlm(mlm(sH22, Map(c, Omegas[-j]), (1:r)[-j],
|
||||
transposed = TRUE),
|
||||
Map(c, Omegas[-l]), (1:r)[-l] + r,
|
||||
transposed = TRUE)
|
||||
dim(deriv) <- c(p[j], p[j], p[l], p[l])
|
||||
|
||||
# enforce symmetry of `Omega_j`
|
||||
of.diag <- (slice.index(deriv, 1:2) != slice.index(deriv, 2:1))
|
||||
deriv <- deriv + of.diag * aperm(deriv, c(2, 1, 3, 4))
|
||||
|
||||
# as well as the symmetry of `Omega_l`
|
||||
of.diag <- (slice.index(deriv, 3:4) != slice.index(deriv, 4:3))
|
||||
deriv <- deriv + of.diag * aperm(deriv, c(1, 2, 4, 3))
|
||||
|
||||
# matricize and return
|
||||
dim(deriv) <- c(p[j]^2, p[l]^2)
|
||||
deriv
|
||||
}, J, L)
|
||||
|
||||
names(I12) <- sprintf("I(eta1, alpha_%d)", 1:r)
|
||||
names(I13) <- sprintf("I(eta1, Omega_%d)", 1:r)
|
||||
names(I22) <- sprintf("I(alpha_%d, alpha_%d)", J, L)
|
||||
names(I23) <- sprintf("I(alpha_%d, Omega_%d)", J, L)
|
||||
names(I33) <- sprintf("I(Omega_%d, Omega_%d)", J, L)
|
||||
|
||||
list(I11 = I11, I12 = I12, I13 = I13, I22 = I22, I23 = I23, I33 = I33)
|
||||
}
|
||||
|
||||
# Hessian of the scaled negative log-likelihood
|
||||
hessian <- function(X, Fy, eta1, alphas, Omegas) {
|
||||
stop("Not Implemented")
|
||||
}
|
||||
},
|
||||
|
||||
########################################################################
|
||||
### Multi-Variate Bernoulli ###
|
||||
########################################################################
|
||||
bernoulli = {
|
||||
require(mvbernoulli)
|
||||
|
||||
initialize <- function(X, Fy) {
|
||||
p <- head(dim(X), -1)
|
||||
r <- length(p)
|
||||
|
||||
# Mode-Covariances
|
||||
XSigmas <- mcov(X, sample.axis = r + 1L)
|
||||
YSigmas <- mcov(Fy, sample.axis = r + 1L)
|
||||
|
||||
# Extract main mode covariance directions
|
||||
# Note: (the directions are transposed!)
|
||||
XDirs <- Map(function(Sigma) {
|
||||
SVD <- La.svd(Sigma, nu = 0)
|
||||
sqrt(SVD$d) * SVD$vt
|
||||
}, XSigmas)
|
||||
YDirs <- Map(function(Sigma) {
|
||||
SVD <- La.svd(Sigma, nu = 0)
|
||||
sqrt(SVD$d) * SVD$vt
|
||||
}, YSigmas)
|
||||
|
||||
alphas <- Map(function(xdir, ydir) {
|
||||
s <- min(ncol(xdir), nrow(ydir))
|
||||
crossprod(xdir[seq_len(s), , drop = FALSE],
|
||||
ydir[seq_len(s), , drop = FALSE])
|
||||
}, XDirs, YDirs)
|
||||
|
||||
list(
|
||||
eta1 = array(0, dim = p),
|
||||
alphas = alphas,
|
||||
Omegas = Map(diag, p)
|
||||
)
|
||||
}
|
||||
|
||||
params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
|
||||
# number of observations
|
||||
n <- tail(dim(Fy), 1)
|
||||
|
||||
# natural exponential family parameters
|
||||
eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1))
|
||||
eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas))
|
||||
|
||||
# # next the conditional Ising model parameters `theta_y`
|
||||
# theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n)
|
||||
# dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n)
|
||||
|
||||
# ltri <- which(lower.tri(eta_y2, diag = TRUE))
|
||||
# diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
|
||||
# theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1)
|
||||
# theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ]
|
||||
|
||||
# conditional Ising model parameters
|
||||
theta_y <- matrix(rep(vech(eta_y2), n), ncol = n)
|
||||
ltri <- which(lower.tri(eta_y2, diag = TRUE))
|
||||
diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
|
||||
theta_y[diagonal, ] <- eta_y1
|
||||
|
||||
theta_y
|
||||
}
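# Packing illustration (hypothetical values, prod(p) = 2): for
#   eta_y2 = [ a  b ]          vech(eta_y2) = (a, b, c)
#            [ b  c ]
# the diagonal positions 1 and 3 of the vech are overwritten with the
# linear predictor eta_y1 while position 2 keeps the interaction term b.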
|
||||
|
||||
# Scaled negative log-likelihood
|
||||
log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
|
||||
# number of observations
|
||||
n <- tail(dim(X), 1L)
|
||||
|
||||
# conditional Ising model parameters
|
||||
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
|
||||
|
||||
# convert to binary data set
|
||||
storage.mode(X) <- "integer"
|
||||
X.mvb <- as.mvbinary(mat(X, length(dim(X))))
|
||||
|
||||
# log-likelihood of the data set
|
||||
-mean(sapply(seq_len(n), function(i) {
|
||||
ising_log_likelihood(theta_y[, i], X.mvb[i])
|
||||
}))
|
||||
}
|
||||
|
||||
# Gradient of the scaled negative log-likelihood
|
||||
grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
|
||||
# retrieve dimensions
|
||||
n <- tail(dim(X), 1) # sample size
|
||||
p <- head(dim(X), -1) # predictor dimensions
|
||||
r <- length(p) # single predictor/response tensor order
|
||||
|
||||
## "Inverse" Link: Ising Model Specific
|
||||
# conditional Ising model parameters: `p (p + 1) / 2` by `n`
|
||||
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
|
||||
|
||||
# conditional expectations
|
||||
# ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y]
|
||||
E2 <- apply(theta_y, 2L, ising_marginal_probs)
|
||||
# convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y]
|
||||
E2 <- E2[vech.pinv.index(prod(p)), ]
|
||||
# extract diagonal elements which are equal to E[vec(X) | Y = y]
|
||||
E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ]
|
||||
## end "Inverse" Link
|
||||
|
||||
dim(X) <- c(prod(p), n)
|
||||
|
||||
# Residuals
|
||||
R <- X - E1
|
||||
dim(R) <- c(p, n)
|
||||
|
||||
# mean deviation between the sample covariance to GLM estimated covariance
|
||||
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
|
||||
S <- rowMeans(colKronecker(X, X) - E2)
|
||||
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
|
||||
|
||||
# Gradients of the negative log-likelihood scaled by sample size
|
||||
list(
|
||||
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
|
||||
"Dl(alphas)" = Map(function(j) {
|
||||
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
|
||||
}, 1:r),
|
||||
"Dl(Omegas)" = Map(function(j) {
|
||||
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
|
||||
# adapt to the symmetry constraint for the derivative
|
||||
dim(deriv) <- c(p[j], p[j])
|
||||
deriv + t(deriv * (1 - diag(p[j])))
|
||||
}, 1:r)
|
||||
)
|
||||
}
|
||||
|
||||
# Hessian of the scaled negative log-likelihood
|
||||
hessian <- function(X, Fy, alphas, Omegas, c1 = 1, c2 = 1) {
|
||||
stop("Not Implemented")
|
||||
}
|
||||
|
||||
# Conditional Fisher Information
|
||||
fisher.info <- function(Fy, alphas, Omegas) {
|
||||
stop("Not Implemented")
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
list(
|
||||
family = name,
|
||||
initialize = initialize,
|
||||
params = params,
|
||||
# linkinv = linkinv,
|
||||
log.likelihood = log.likelihood,
|
||||
grad = grad,
|
||||
hessian = hessian,
|
||||
fisher.info = fisher.info
|
||||
)
|
||||
}
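## Usage sketch (assumes the package namespace with mcov, mlm, etc. is
## attached; predictors and responses are tensors with the sample axis last)
# n <- 100
# X <- array(rnorm(3 * 4 * n), dim = c(3, 4, n))
# Fy <- array(rnorm(2 * 3 * n), dim = c(2, 3, n))
# family <- make.gmlm.family("normal")
# theta0 <- family$initialize(X, Fy)
# with(theta0, family$log.likelihood(X, Fy, eta1, alphas, Omegas))
# str(with(theta0, family$grad(X, Fy, eta1, alphas, Omegas)))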
|
|
@@ -15,35 +15,43 @@
|
|||
#'
|
||||
#' @export
|
||||
matrixImage <- function(A, add.values = FALSE,
|
||||
main = NULL, sub = NULL, interpolate = FALSE, ...,
|
||||
main = NULL, sub = NULL, interpolate = FALSE, ..., zlim = NA,
|
||||
axes = TRUE, asp = 1, col = hcl.colors(24, "YlOrRd", rev = FALSE),
|
||||
digits = getOption("digits")
|
||||
) {
|
||||
# plot raster image
|
||||
plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "red",
|
||||
xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main, sub = sub)
|
||||
plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "black",
|
||||
xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main, sub = sub,
|
||||
asp = asp)
|
||||
|
||||
# Scale values of `A` to [0, 1] with min mapped to 1 and max to 0.
|
||||
if (missing(zlim)) {
|
||||
S <- (max(A) - A) / diff(range(A))
|
||||
} else {
|
||||
S <- pmin(pmax(0, (zlim[2] - A) / diff(zlim)), 1)
|
||||
}
|
||||
# and now transform to colors
|
||||
S <- matrix(col[round((length(col) - 1) * S + 1)], nrow = nrow(A))
|
||||
|
||||
# Raster image plotting the matrix with element values mapped to the color
|
||||
# scale such that big elements (of the original matrix A) are dark and small
|
||||
# (negative) elements are white.
|
||||
rasterImage(S, 0, 0, ncol(A), nrow(A), interpolate = interpolate, ...)
|
||||
|
||||
# Add X-axis giving index
|
||||
# X/Y axes index (matches coordinates to matrix indices)
|
||||
x <- seq(1, ncol(A), by = 1)
|
||||
axis(1, at = x - 0.5, labels = x, lwd = 0, lwd.ticks = 1)
|
||||
# Add Y-axis
|
||||
y <- seq(1, nrow(A))
|
||||
if (axes) {
|
||||
axis(1, at = x - 0.5, labels = x, lwd = 0, lwd.ticks = 1)
|
||||
axis(2, at = y - 0.5, labels = rev(y), lwd = 0, lwd.ticks = 1, las = 1)
|
||||
}
|
||||
|
||||
# Writes matrix values (in colored element grids)
|
||||
# Writes matrix values
|
||||
if (any(add.values)) {
|
||||
if (length(add.values) > 1) {
|
||||
A[!add.values] <- NA
|
||||
A[add.values] <- format(A[add.values], digits = digits)
|
||||
}
|
||||
text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A,
|
||||
adj = 0.5, col = as.integer(S > 0.65))
|
||||
text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A, adj = 0.5)
|
||||
}
|
||||
}
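## Hypothetical usage of the extended interface: `zlim` fixes the color scale
## across plots, `axes` toggles the index axes and `asp` the aspect ratio.
# A <- matrix(rnorm(3 * 4), 3, 4)
# matrixImage(A, add.values = TRUE, main = "example", zlim = c(-3, 3))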
|
||||
|
|
|
@@ -3,34 +3,101 @@
|
|||
#' @example inst/examples/num_deriv.R
|
||||
#'
|
||||
#' @export
|
||||
num.deriv <- function(F, X, h = 1e-6, sym = FALSE) {
|
||||
num.deriv <- function(expr, ..., h = 1e-6, sym = FALSE) {
|
||||
sexpr <- substitute(expr)
|
||||
if (...length() != 1) {
|
||||
stop("Expectd exactly one '...' variable")
|
||||
}
|
||||
var <- ...names()[1]
|
||||
if (is.null(var)) {
|
||||
arg <- substitute(...)
|
||||
var <- if (is.symbol(arg)) as.character(arg) else "x"
|
||||
}
|
||||
|
||||
if (is.language(sexpr) && !is.symbol(sexpr) && sexpr[[1]] == as.symbol("function")) {
|
||||
func <- expr
|
||||
} else {
|
||||
if (is.name(sexpr)) {
|
||||
expr <- call(as.character(sexpr), as.name(var))
|
||||
} else {
|
||||
if ((!is.call(sexpr) && !is.expression(sexpr))
|
||||
|| !(var %in% all.vars(sexpr))) {
|
||||
stop("'expr' must be a function or expression containing '", var, "'")
|
||||
}
|
||||
expr <- sexpr
|
||||
}
|
||||
|
||||
args <- as.pairlist(structure(list(alist(x = )[[1]]), names = var))
|
||||
func <- as.function(c(args, expr), envir = parent.frame())
|
||||
}
|
||||
|
||||
num.deriv.function(func, ..1, h = h, sym = sym)
|
||||
}
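# Interface sketch (mirrors inst/examples/num_deriv.R): functions,
# expressions with the variable given by symbol, and expressions with a
# named '...' argument are all accepted, e.g. for matrices A, B
#   num.deriv(function(X) X %*% B, A)
#   num.deriv(A %*% B, A)
#   num.deriv(X %*% B, X = A)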
|
||||
|
||||
#' @rdname num.deriv
|
||||
#' @export
|
||||
num.deriv.function <- function(FUN, X, h = 1e-6, sym = FALSE) {
|
||||
if (sym) {
|
||||
stopifnot(isSymmetric(X))
|
||||
p <- nrow(X)
|
||||
k <- seq_along(X) - 1
|
||||
mapply(function(i, j) {
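# perturb the (i, j) and (j, i) entries simultaneously such that
# X + dx and X - dx stay symmetric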
|
||||
dx <- h * ((k == i * p + j) | (k == j * p + i))
|
||||
(F(X + dx) - F(X - dx)) / (2 * h)
|
||||
(FUN(X + dx) - FUN(X - dx)) / (2 * h)
|
||||
}, .row(dim(X)) - 1, .col(dim(X)) - 1)
|
||||
} else {
|
||||
sapply(seq_along(X), function(i) {
|
||||
dx <- h * (seq_along(X) == i)
|
||||
(F(X + dx) - F(X - dx)) / (2 * h)
|
||||
(FUN(X + dx) - FUN(X - dx)) / (2 * h)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#' Second numeric derivative
|
||||
#'
|
||||
#' @rdname num.deriv
|
||||
#' @export
|
||||
num.deriv2 <- function(F, X, Y, h = 1e-6, symX = FALSE, symY = FALSE) {
|
||||
num.deriv2 <- function(FUN, X, Y, h = 1e-6, symX = FALSE, symY = FALSE) {
|
||||
if (missing(Y)) {
|
||||
num.deriv(function(x) {
|
||||
num.deriv(function(z) F(z), x, h = h, sym = symX)
|
||||
num.deriv.function(function(x) {
|
||||
num.deriv.function(FUN, x, h = h, sym = symX)
|
||||
}, X, h = h, sym = symX)
|
||||
} else {
|
||||
num.deriv(function(y) {
|
||||
num.deriv(function(x) F(x, y), X, h = h, sym = symX)
|
||||
num.deriv.function(function(y) {
|
||||
num.deriv.function(function(x) FUN(x, y), X, h = h, sym = symX)
|
||||
}, Y, h = h, sym = symY)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
### Interface Idea / Demo
|
||||
# num.deriv2.function
|
||||
# num.deriv2 <- function(expr, var) {
|
||||
# sexpr <- substitute(expr)
|
||||
# svar <- substitute(var)
|
||||
#
|
||||
# if (is.language(sexpr) && !is.symbol(sexpr) && sexpr[[1]] == as.symbol("function")) {
|
||||
# func <- expr
|
||||
# } else {
|
||||
# if (is.name(sexpr)) {
|
||||
# expr <- call(as.character(sexpr), as.name(svar))
|
||||
# } else {
|
||||
# if ((!is.call(sexpr) && !is.expression(sexpr))
|
||||
# || !(as.character(svar) %in% all.vars(sexpr))) {
|
||||
# stop("'expr' must be a function or expression containing '",
|
||||
# as.character(svar), "'")
|
||||
# }
|
||||
# expr <- sexpr
|
||||
# }
|
||||
#
|
||||
# args <- as.pairlist(structure(list(alist(x = )[[1]]), names = as.character(svar)))
|
||||
# func <- as.function(c(args, expr), envir = parent.frame())
|
||||
# }
|
||||
#
|
||||
# num.deriv2.function(func, var)
|
||||
# }
|
||||
# y <- c(pi, exp(1), (sqrt(5) + 1) / 2)
|
||||
# num.deriv2(function(x) x^2 + y, x)(1:3)
|
||||
# num.deriv2(x^2 + y, x)(1:3)
|
||||
# func <- function(x) x^2 + y
|
||||
# num.deriv2(func, x)(1:3)
|
||||
# func2 <- function(z = y, x) x^2 + z
|
||||
# num.deriv2(func2, x)(1:3)
|
||||
|
|
|
@@ -4,15 +4,17 @@ A <- A + t(A)
|
|||
B <- matrix(rnorm(3 * 4), 3, 4)
|
||||
|
||||
# F(A) = A B
|
||||
# DF(A) = B' x I
|
||||
stopifnot(all.equal(
|
||||
num.deriv(function(X) X %*% B, A),
|
||||
# DF(A) = B' %x% I
|
||||
exprs.all.equal({
|
||||
t(B) %x% diag(nrow(A))
|
||||
))
|
||||
num.deriv(function(X) X %*% B, A)
|
||||
num.deriv(A %*% B, A)
|
||||
num.deriv(X %*% B, X = A)
|
||||
})
|
||||
|
||||
# Symmetric case, constraint A = A' (equiv to being a function of vech(A) only)
|
||||
# F(A) = A B for A = A'
|
||||
# DF(A) = B' x I
|
||||
# DF(A) = B' %x% I
|
||||
stopifnot(all.equal(
|
||||
num.deriv(function(X) X %*% B, A, sym = TRUE),
|
||||
(t(B) %x% diag(nrow(A))) %*% D(nrow(A)) %*% t(D(nrow(A)))
|
||||
|
|
|
@@ -0,0 +1,4 @@
|
|||
library(testthat)
|
||||
library(tensorPredictors)
|
||||
|
||||
test_check("tensorPredictors")
|
|
@@ -0,0 +1,50 @@
|
|||
# F(X) = log(det(X)) if det(X) > 0
|
||||
# dF(X) = tr(X^-1 dX)
|
||||
# DF(X) = vec((X^-1)')'
|
||||
test_that("Matrix Calculus 1", {
|
||||
# test data
|
||||
X <- tcrossprod(diag(5) + matrix(runif(5^2, -.1, .1), 5))
|
||||
|
||||
num.grad <- num.deriv(function(X) log(det(X)), X)
|
||||
ana.grad <- c(solve(X))
|
||||
|
||||
expect_equal(num.grad, ana.grad)
|
||||
})
|
||||
|
||||
# F(mu) = <X - mu, (X - mu) x_{k in [r]} Delta_k^-1> for Delta_k = Delta_k'
|
||||
# DF(mu) = -2 vec((X - mu) x_{k in [r]} Delta_k^-1)'
|
||||
test_that("Matrix Calculus 2", {
|
||||
p <- c(2, 3, 5)
|
||||
|
||||
X <- array(rnorm(prod(p)), dim = p)
|
||||
mu <- array(rnorm(prod(p)), dim = p)
|
||||
Deltas <- Map(function(pj) tcrossprod(diag(pj) + runif(pj^2, -0.1, 0.1)), p)
|
||||
|
||||
ana.grad <- -2 * c(mlm(X - mu, Map(solve, Deltas)))
|
||||
num.grad <- num.deriv(function(mu) sum((X - mu) * mlm(X - mu, Map(solve, Deltas))), mu)
|
||||
|
||||
expect_equal(num.grad, ana.grad)
|
||||
})
|
||||
|
||||
# F(Delta_j) = <X - mu, (X - mu) x_{k in [r]} Delta_k^-1> for Delta_k = Delta_k'
|
||||
# DF(Delta_j) = -((X - mu) x_{k in [r]} Delta_k^-1)_(j) (X - mu)_(j)' Delta_j^-1
|
||||
test_that("Matrix Calculus 3", {
|
||||
# config
|
||||
p <- c(2, 3, 5)
|
||||
|
||||
# generate test data
|
||||
X <- array(rnorm(prod(p)), dim = p)
|
||||
mu <- array(rnorm(prod(p)), dim = p)
|
||||
Deltas <- Map(function(pj) tcrossprod(diag(pj) + runif(pj^2, -0.1, 0.1)), p)
|
||||
|
||||
# check analytic to numeric derivatives
|
||||
for (j in seq_along(Deltas)) {
|
||||
num.grad <- matrix(num.deriv(function(Delta_j) {
|
||||
Deltas[[j]] <- Delta_j
|
||||
sum((X - mu) * mlm(X - mu, Map(solve, Deltas)))
|
||||
}, Deltas[[j]]), p[j])
|
||||
ana.grad <- -mcrossprod(mlm(X - mu, Map(solve, Deltas)), X - mu, j) %*% solve(Deltas[[j]])
|
||||
|
||||
expect_equal(num.grad, ana.grad)
|
||||
}
|
||||
})
|
|
@@ -0,0 +1,186 @@
|
|||
test_that("Tensor-Normal Score", {
|
||||
# setup dimensions
|
||||
n <- 11L
|
||||
p <- c(3L, 5L, 4L)
|
||||
q <- c(2L, 7L, 3L)
|
||||
|
||||
# create "true" GLM parameters
|
||||
eta1 <- array(rnorm(prod(p)), dim = p)
|
||||
alphas <- Map(matrix, Map(rnorm, p * q), p)
|
||||
Omegas <- Map(function(pj) {
|
||||
solve(0.5^abs(outer(seq_len(pj), seq_len(pj), `-`)))
|
||||
}, p)
|
||||
params <- list(eta1, alphas, Omegas)
|
||||
|
||||
# compute tensor normal parameters from GLM parameters
|
||||
Deltas <- Map(solve, Omegas)
|
||||
mu <- mlm(eta1, Deltas)
|
||||
|
||||
# sample some test data
|
||||
sample.axis <- length(p) + 1L
|
||||
Fy <- array(rnorm(n * prod(q)), dim = c(q, n))
|
||||
X <- mlm(Fy, Map(`%*%`, Deltas, alphas)) + rtensornorm(n, mu, Deltas, sample.axis)
|
||||
|
||||
# Create a GLM family object
|
||||
family <- make.gmlm.family("normal")
|
||||
|
||||
# first unpack the family object
|
||||
log.likelihood <- family$log.likelihood
|
||||
grad <- family$grad
|
||||
|
||||
# compare numeric gradient against the analytic gradient provided by the
|
||||
# family at the initial parameters
|
||||
grad.num <- do.call(function(eta1, alphas, Omegas) {
|
||||
list(
|
||||
"Dl(eta1)" = num.deriv.function(function(eta1) {
|
||||
log.likelihood(X, Fy, eta1, alphas, Omegas)
|
||||
}, eta1),
|
||||
"Dl(alphas)" = Map(function(j) num.deriv.function(function(alpha_j) {
|
||||
alphas[[j]] <- alpha_j
|
||||
log.likelihood(X, Fy, eta1, alphas, Omegas)
|
||||
}, alphas[[j]]), seq_along(alphas)),
|
||||
"Dl(Omegas)" = Map(function(j) num.deriv.function(function(Omega_j) {
|
||||
Omegas[[j]] <- Omega_j
|
||||
log.likelihood(X, Fy, eta1, alphas, Omegas)
|
||||
}, Omegas[[j]], sym = TRUE), seq_along(Omegas))
|
||||
)
|
||||
}, params)
|
||||
grad.ana <- do.call(grad, c(list(X, Fy), params))
|
||||
|
||||
expect_equal(RMap(c, grad.ana), grad.num, tolerance = 1e-6)
|
||||
|
||||
# test for correct dimensions
|
||||
ana.dims <- list(
|
||||
"Dl(eta1)" = p,
|
||||
"Dl(alphas)" = Map(c, p, q),
|
||||
"Dl(Omegas)" = Map(c, p, p)
|
||||
)
|
||||
|
||||
expect_identical(RMap(dim, grad.ana), ana.dims)
|
||||
})
|
||||
|
||||
test_that("Tensor-Normal moments of the sufficient statistic t(X)", {
|
||||
# config
|
||||
# # (estimated) sample size for sample estimates
|
||||
# n <- 250000 # sample estimates are too unreliable for testing purposes
|
||||
p <- c(2L, 3L)
|
||||
r <- length(p)
|
||||
|
||||
# setup tensor normal parameters (NO dependence on Fy, set to zero)
|
||||
mu <- array(runif(prod(p), min = -0.5, max = 0.5), dim = p) # = mu_y
|
||||
Deltas <- Map(function(pj) 0.5^abs(outer(1:pj, 1:pj, `-`)), p)
|
||||
|
||||
# compute GMLM parameters (NO alphas as Fy is zero)
|
||||
Omegas <- Map(solve, Deltas)
|
||||
eta1 <- mlm(mu, Omegas)
|
||||
|
||||
# natural parameters of the tensor normal
|
||||
eta_y1 <- eta1 # + mlm(Fy, alphas) = + 0
|
||||
eta_y2 <- -0.5 * Reduce(`%x%`, rev(Omegas))
|
||||
|
||||
# draw a sample
|
||||
# X <- rtensornorm(n, mu, Deltas, r + 1L) # unused while the sample estimates (and `n`) are disabled
|
||||
|
||||
# tensor normal log-partition function given eta_y
|
||||
log.partition <- function(eta_y1, eta_y2) {
|
||||
# eta_y1 as "model" matrix of dimensions `n by p` where `n` might be `1`
|
||||
if (length(eta_y1)^2 == length(eta_y2)) {
|
||||
pp <- length(eta_y1)
|
||||
dim(eta_y1) <- c(1L, pp)
|
||||
} else {
|
||||
eta_y1 <- mat(eta_y1, r + 1L)
|
||||
pp <- ncol(eta_y1)
|
||||
}
|
||||
|
||||
# treat eta_y2 as square matrix
|
||||
if (!is.matrix(eta_y2)) {
|
||||
dim(eta_y2) <- c(pp, pp)
|
||||
}
|
||||
|
||||
# evaluate log-partition function in terms of natural parameters
|
||||
-0.25 * pp * mean((eta_y1 %*% solve(eta_y2)) * eta_y1) - 0.5 * log(det(-2 * eta_y2))
|
||||
}
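# The closed form above is assumed to follow from the normal density:
#   b(eta_y) = -1/4 eta_y1' eta_y2^-1 eta_y1 - 1/2 log det(-2 eta_y2)
# (averaged over observations), which for eta_y1 = Omega mu and
# eta_y2 = -Omega / 2 reduces to 1/2 mu' Omega mu - 1/2 log det(Omega).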
|
||||
|
||||
|
||||
# (analytic) first and second (un-centered) moment of the tensor normal
|
||||
# Eti = E[ti(X) | Y = y]
|
||||
Et1.ana <- mu
|
||||
Et2.ana <- Reduce(`%x%`, rev(Deltas)) + outer(c(mu), c(mu))
|
||||
|
||||
# (numeric) derivative of the log-partition function with respect to the natural
|
||||
# parameters of the exponential family form of the tensor normal
|
||||
Et1.num <- num.deriv(log.partition(eta_y1, eta_y2), eta_y1)
|
||||
Et2.num <- num.deriv(log.partition(eta_y1, eta_y2), eta_y2)
|
||||
|
||||
# # (estimated) estimate from sample
|
||||
# Et1.est <- rowMeans(X, dims = r)
|
||||
# Et2.est <- colMeans(rowKronecker(mat(X, r + 1L), mat(X, r + 1L)))
|
||||
|
||||
# (analytic) covariance blocks of the sufficient statistic
|
||||
# Ctij = Cov(ti(X), tj(X) | Y = y)
|
||||
Ct11.ana <- Reduce(`%x%`, rev(Deltas))
|
||||
Ct12.ana <- local({
|
||||
# Analytic solution / `vec(mu %x% Delta) + vec(mu' %x% Delta)`
|
||||
ct12 <- 2 * c(mu) %x% Reduce(`%x%`, rev(Deltas))
|
||||
# and symmetrize!
|
||||
dim(ct12) <- rep(prod(p), 3)
|
||||
(1 / 2) * (ct12 + aperm(ct12, c(3, 1, 2)))
|
||||
})
|
||||
Ct22.ana <- local({
|
||||
Delta <- Reduce(`%x%`, rev(Deltas)) # Ct11.ana
|
||||
ct22 <- (2 * Delta + 4 * outer(c(mu), c(mu))) %x% Delta
|
||||
|
||||
# TODO: What does this symmetrization do exactly? And why?!
|
||||
dim(ct22) <- rep(prod(p), 4)
|
||||
(1 / 4) * (
|
||||
aperm(ct22, c(2, 3, 1, 4)) +
|
||||
aperm(ct22, c(2, 3, 4, 1)) +
|
||||
aperm(ct22, c(3, 2, 1, 4)) +
|
||||
aperm(ct22, c(3, 2, 4, 1))
|
||||
)
|
||||
})
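# A plausible reason for the symmetrization (an assumption, cf. TODO):
# t2(X) = vec(X) %x% vec(X) contains every product X_a X_b twice, hence
# its covariance is invariant under swapping indices within the row pair
# and within the column pair; averaging over these swaps projects the
# unsymmetrized Kronecker formula onto that symmetry.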
|
||||
|
||||
# (numeric) second derivative of the log-partition function
|
||||
Ct11.num <- num.deriv2(function(eta_y1) log.partition(eta_y1, eta_y2), eta_y1)
|
||||
Ct12.num <- num.deriv2(log.partition, eta_y1, eta_y2)
|
||||
Ct22.num <- local({
|
||||
ct22 <- num.deriv2(function(eta_y2) log.partition(eta_y1, eta_y2), eta_y2)
|
||||
# TODO: Why does this need to be symmetrized?!
|
||||
dim(ct22) <- rep(prod(p), 4)
|
||||
(1 / 4) * (
|
||||
aperm(ct22, c(3, 4, 2, 1)) +
|
||||
aperm(ct22, c(4, 3, 2, 1)) +
|
||||
aperm(ct22, c(3, 4, 1, 2)) +
|
||||
aperm(ct22, c(4, 3, 1, 2))
|
||||
)
|
||||
})
|
||||
|
||||
# # (estimated) sample estimates of the sufficient statistic covariance blocks
|
||||
# T1 <- mat(X, r + 1L)
|
||||
# T2 <- rowKronecker(T1, T1)
|
||||
# Ct11.est <- cov(T1)
|
||||
# Ct12.est <- cov(T1, T2)
|
||||
# Ct22.est <- cov(T2, T2)
|
||||
|
||||
|
||||
tolerance <- 1e-3
|
||||
expect_equal(c(Et1.ana), c(Et1.num), tolerance = tolerance)
|
||||
# expect_equal(c(Et1.ana), c(Et1.est), tolerance = 0.05, scale = 1)
|
||||
# expect_equal(c(Et1.num), c(Et1.est), tolerance = 0.05, scale = 1)
|
||||
|
||||
expect_equal(c(Et2.ana), c(Et2.num), tolerance = tolerance)
|
||||
# expect_equal(c(Et2.ana), c(Et2.est), tolerance = 0.05, scale = 1)
|
||||
# expect_equal(c(Et2.num), c(Et2.est), tolerance = 0.05, scale = 1)
|
||||
|
||||
expect_equal(c(Ct11.ana), c(Ct11.num), tolerance = tolerance)
|
||||
# expect_equal(c(Ct11.ana), c(Ct11.est), tolerance = 0.05, scale = 1)
|
||||
# expect_equal(c(Ct11.num), c(Ct11.est), tolerance = 0.05, scale = 1)
|
||||
|
||||
expect_equal(c(Ct12.ana), c(Ct12.num), tolerance = tolerance)
|
||||
# expect_equal(c(Ct12.ana), c(Ct12.est), tolerance = 0.05, scale = 1)
|
||||
# expect_equal(c(Ct12.num), c(Ct12.est), tolerance = 0.05, scale = 1)
|
||||
|
||||
expect_equal(c(Ct22.ana), c(Ct22.num), tolerance = tolerance)
|
||||
# expect_equal(c(Ct22.ana), c(Ct22.est), tolerance = 0.05, scale = 1)
|
||||
# expect_equal(c(Ct22.num), c(Ct22.est), tolerance = 0.05, scale = 1)
|
||||
})
|