fix: some typos in GMLM.tex,

add: extended interface of num.deriv and matrixImage,
add: "tests",
add: moved make.gmlm.family into separate file
This commit is contained in:
Daniel Kapla 2022-10-31 15:14:58 +01:00
parent 79f16c58dc
commit 2dfdc7083b
11 changed files with 746 additions and 298 deletions

View File

@ -139,11 +139,12 @@
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We consider regression and classification for \textit{general} response and tensor-valued predictors (multi-dimensional arrays) and propose a \textit{novel formulation} for sufficient dimension reduction. Assuming the distribution of the tensor-valued predictors given the response is in the quadratic exponential family, we model the natural parameter as a multi-linear function of the response.
This allows per-axis reductions that drastically reduce the total number of parameters for higher order tensor-valued predictors. We derive maximum likelihood estimates for the sufficient dimension reduction and a computationally efficient estimation algorithm which leverages the tensor structure. The performance of the method is illustrated via simulations and real world examples are provided.
\end{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -523,13 +524,13 @@ The next step is to identify the Hessians from the second differentials in a sim
&\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \vec\biggl( (\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j) \Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))} \biggr) \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big)_{(j)}\otimes\mat{I}_{p_j} \biggr) \mat{K}_{\mat{p},(j)}\mat{H}_{1,1}\t{\mat{K}_{\mat{p},(l)}} \biggl( \t{\Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big)_{(l)}}\otimes\mat{I}_{p_l} \biggr)\vec{\d\mat{\alpha}_l} \\
&\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \biggl( \t{\Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))}}\otimes\mat{I}_{p_j p_l} \biggr) \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \vec{\d\mat{\alpha}_l} \\
&\qquad + c_1 \vec\biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr) \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \vec{\d\mat{\alpha}_l} \\
&\qquad + c_1 \t{(\vec{\d\mat{\alpha}_j})} \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3))} \vec{\d\mat{\alpha}_l} \\
&\qquad \begin{aligned}
\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\mat{\alpha}_l})}} &=
-c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \\
&\qquad + c_1 \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3) + [[j > l]])}
\qquad{\color{gray} (p_j q_j \times p_l q_l)}
\end{aligned}
@ -591,22 +592,23 @@ and with that all auxiliary calculation are done. Using the two expectations yie
\mathcal{I}_{2 r + 1,1} & \mathcal{I}_{2 r + 1,2} & \cdots & \mathcal{I}_{2 r + 1, 2 r + 1}
\end{pmatrix}
\end{displaymath}
for $j, l = 1, ..., r$, where every block satisfies $\mathcal{I}_{j, l} = \t{\mathcal{I}_{l, j}}$. The individual blocks, for $j = 1, ..., r$ and $j \leq l \leq r$, are given by
\begin{align*}
\mathcal{I}_{1,1} &= c_1^2 (\ten{H}_{1,1})_{([r])} \\
\mathcal{I}_{1,j+1} % = \E\partial_{\vec{\overline{\ten{\eta}}_1}}\partial_{\t{(\vec{\mat{\alpha}_j})}} l(\mat{\Theta})\mid \ten{Y} = y
&= c_1^2 \Big[\Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1}\Big]_{((2, 1))} \\
\mathcal{I}_{1,j+r+1}
&= c_1 c_2 \Big( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)} \\
\mathcal{I}_{j+1,l+1}
&= c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \\
\mathcal{I}_{j+1,l+r+1}
&= c_1 c_2 \biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{((r + 2, 1))} \\
\mathcal{I}_{j+r+1,l+r+1}
&= c_2^2 \Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}
\end{align*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Vectorization and Matricization}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -615,7 +617,7 @@ The \emph{matricization} is a generalization of the \emph{vectorization} operati
\begin{theorem}\label{thm:mlm_mat}
Let $\ten{A}$ be a tensor of order $r$ with dimensions $q_1\times ... \times q_r$. Furthermore, for $k = 1, ..., r$, let $\mat{B}_k$ be matrices of dimensions $p_k\times q_k$. Then, for any $(\mat{i}, \mat{j})\in\perm{r}$, it holds that
\begin{displaymath}
\Big(\ten{A}\mlm{k\in[r]}\mat{B}_k\Big)_{(\mat{i}, \mat{j})}
= \Big(\bigotimes_{k = \len{\mat{i}}}^{1}\mat{B}_{\mat{i}_k}\Big) \ten{A}_{(\mat{i}, \mat{j})} \Big(\bigotimes_{k = \len{\mat{j}}}^{1}\t{\mat{B}_{\mat{j}_k}}\Big).
\end{displaymath}
\end{theorem}
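For example, choosing $\mat{i} = (k)$ and $\mat{j} = (1, ..., k - 1, k + 1, ..., r)$ (assuming the mode-$k$ matricization $\ten{A}_{(k)}$ orders the remaining modes increasingly) recovers the familiar identity
\begin{displaymath}
\Big(\ten{A}\mlm{k\in[r]}\mat{B}_k\Big)_{(k)}
= \mat{B}_k \ten{A}_{(k)} \t{(\mat{B}_r \otimes \cdots \otimes \mat{B}_{k + 1} \otimes \mat{B}_{k - 1} \otimes \cdots \otimes \mat{B}_1)}.
\end{displaymath}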

View File

@ -47,6 +47,7 @@ export(mkm)
export(mlm)
export(mtvk)
export(num.deriv)
export(num.deriv.function)
export(num.deriv2)
export(reduce)
export(rowKronecker)

View File

@ -5,265 +5,6 @@ GMLM <- function(...) {
stop("Not Implemented") stop("Not Implemented")
} }
make.gmlm.family <- function(name) {
# standardize family name
name <- list(
normal = "normal", gaussian = "normal",
bernoulli = "bernoulli", ising = "bernoulli"
)[[tolower(name), exact = FALSE]]
############################################################################
# #
# TODO: better (and possibly specialized) initial parameters!?!?!?! #
# #
############################################################################
switch(name,
normal = {
initialize <- function(X, Fy) {
p <- head(dim(X), -1)
r <- length(p)
# Mode-Covariances
XSigmas <- mcov(X, sample.axis = r + 1L)
YSigmas <- mcov(Fy, sample.axis = r + 1L)
# Extract main mode covariance directions
# Note: (the directions are transposed!)
XDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, XSigmas)
YDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, YSigmas)
alphas <- Map(function(xdir, ydir) {
s <- min(ncol(xdir), nrow(ydir))
crossprod(xdir[seq_len(s), , drop = FALSE],
ydir[seq_len(s), , drop = FALSE])
}, XDirs, YDirs)
list(
eta1 = rowMeans(X, dims = r),
alphas = alphas,
Omegas = Map(diag, p)
)
}
# parameters of the tensor normal computed from the GLM parameters
params <- function(Fy, eta1, alphas, Omegas) {
Deltas <- Map(solve, Omegas)
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
list(mu_y = mu_y, Deltas = Deltas)
}
# scaled negative log-likelihood
log.likelihood <- function(X, Fy, eta1, alphas, Omegas) {
n <- tail(dim(X), 1) # sample size
# conditional mean
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas))
# negative log-likelihood scaled by sample size
# Note: the `suppressWarnings` is because `log(mapply(det, Omegas))`
# might fail, but `NAGD` has failsafes against cases of "illegal"
# parameters.
suppressWarnings(
0.5 * prod(p) * log(2 * pi) +
sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) -
(0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p)
)
}
# gradient of the scaled negative log-likelihood
grad <- function(X, Fy, eta1, alphas, Omegas) {
# retrieve dimensions
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions
r <- length(p) # single predictor/response tensor order
## "Inverse" Link: Tensor Normal Specific
# known exponential family constants
c1 <- 1
c2 <- -0.5
# Covariances from the GLM parameter Scatter matrices
Deltas <- Map(solve, Omegas)
# First moment via "inverse" link `g1(eta_y) = E[X | Y = y]`
E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
# Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]`
dim(E1) <- c(prod(p), n)
E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1))
## end "Inverse" Link
dim(X) <- c(prod(p), n)
# Residuals
R <- X - E1
dim(R) <- c(p, n)
# mean deviation between the sample covariance and the GLM estimated covariance
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
S <- rowMeans(colKronecker(X, X)) - E2 # <- Optimized for Tensor Normal
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
# Gradients of the negative log-likelihood scaled by sample size
list(
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
"Dl(alphas)" = Map(function(j) {
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
}, 1:r),
"Dl(Omegas)" = Map(function(j) {
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
# adapt to the symmetric constraint for the derivative
dim(deriv) <- c(p[j], p[j])
deriv + t(deriv * (1 - diag(p[j])))
}, 1:r)
)
}
},
bernoulli = {
require(mvbernoulli)
initialize <- function(X, Fy) {
p <- head(dim(X), -1)
r <- length(p)
# Mode-Covariances
XSigmas <- mcov(X, sample.axis = r + 1L)
YSigmas <- mcov(Fy, sample.axis = r + 1L)
# Extract main mode covariance directions
# Note: (the directions are transposed!)
XDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, XSigmas)
YDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, YSigmas)
alphas <- Map(function(xdir, ydir) {
s <- min(ncol(xdir), nrow(ydir))
crossprod(xdir[seq_len(s), , drop = FALSE],
ydir[seq_len(s), , drop = FALSE])
}, XDirs, YDirs)
list(
eta1 = array(0, dim = p),
alphas = alphas,
Omegas = Map(diag, p)
)
}
params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# number of observations
n <- tail(dim(Fy), 1)
# natural exponential family parameters
eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1))
eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas))
# # next the conditional Ising model parameters `theta_y`
# theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n)
# dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n)
# ltri <- which(lower.tri(eta_y2, diag = TRUE))
# diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
# theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1)
# theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ]
# conditional Ising model parameters
theta_y <- matrix(rep(vech(eta_y2), n), ncol = n)
ltri <- which(lower.tri(eta_y2, diag = TRUE))
diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
theta_y[diagonal, ] <- eta_y1
theta_y
}
# Scaled negative log-likelihood
log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# number of observations
n <- tail(dim(X), 1L)
# conditional Ising model parameters
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
# convert to binary data set
storage.mode(X) <- "integer"
X.mvb <- as.mvbinary(mat(X, length(dim(X))))
# log-likelihood of the data set
-mean(sapply(seq_len(n), function(i) {
ising_log_likelihood(theta_y[, i], X.mvb[i])
}))
}
# Gradient of the scaled negative log-likelihood
grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# retrieve dimensions
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions
r <- length(p) # single predictor/response tensor order
## "Inverse" Link: Ising Model Specific
# conditional Ising model parameters: `p (p + 1) / 2` by `n`
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
# conditional expectations
# ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y]
E2 <- apply(theta_y, 2L, ising_marginal_probs)
# convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y]
E2 <- E2[vech.pinv.index(prod(p)), ]
# extract diagonal elements which are equal to E[vec(X) | Y = y]
E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ]
## end "Inverse" Link
dim(X) <- c(prod(p), n)
# Residuals
R <- X - E1
dim(R) <- c(p, n)
# mean deviation between the sample covariance and the GLM estimated covariance
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
S <- rowMeans(colKronecker(X, X) - E2)
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
# Gradients of the negative log-likelihood scaled by sample size
list(
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
"Dl(alphas)" = Map(function(j) {
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
}, 1:r),
"Dl(Omegas)" = Map(function(j) {
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
# adapt to the symmetric constraint for the derivative
dim(deriv) <- c(p[j], p[j])
deriv + t(deriv * (1 - diag(p[j])))
}, 1:r)
)
}
}
)
list(
family = name,
initialize = initialize,
params = params,
# linkinv = linkinv,
log.likelihood = log.likelihood,
grad = grad
)
}
#' @export
GMLM.default <- function(X, Fy, sample.axis = 1L,
family = "normal",
@ -292,6 +33,7 @@ GMLM.default <- function(X, Fy, sample.axis = 1L,
}
}
# optimize likelihood using Nesterov Accelerated Gradient Descent
params.fit <- NAGD(
fun.loss = function(params) {
# scaled negative log-likelihood

View File

@ -24,7 +24,7 @@
#' matrix(1, 2, 3)
#' array(rep(1, 6), dim = c(2, 3))
#' })
#' # is identical to
#' stopifnot(exprs = {
#' all.equal(matrix(rep(1, 6), 2, 3), matrix(1, 2, 3))
#' all.equal(matrix(rep(1, 6), 2, 3), array(rep(1, 6), dim = c(2, 3)))

View File

@ -0,0 +1,386 @@
make.gmlm.family <- function(name) {
# standardize family name
name <- list(
normal = "normal", gaussian = "normal",
bernoulli = "bernoulli", ising = "bernoulli"
)[[tolower(name), exact = FALSE]]
############################################################################
# #
# TODO: better (and possibly specialized) initial parameters!?!?!?! #
# #
############################################################################
switch(name,
########################################################################
### Tensor Normal ###
########################################################################
normal = {
initialize <- function(X, Fy) {
p <- head(dim(X), -1)
r <- length(p)
# Mode-Covariances
XSigmas <- mcov(X, sample.axis = r + 1L)
YSigmas <- mcov(Fy, sample.axis = r + 1L)
# Extract main mode covariance directions
# Note: (the directions are transposed!)
XDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, XSigmas)
YDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, YSigmas)
alphas <- Map(function(xdir, ydir) {
s <- min(ncol(xdir), nrow(ydir))
crossprod(xdir[seq_len(s), , drop = FALSE],
ydir[seq_len(s), , drop = FALSE])
}, XDirs, YDirs)
list(
eta1 = rowMeans(X, dims = r),
alphas = alphas,
Omegas = Map(diag, p)
)
}
# parameters of the tensor normal computed from the GLM parameters
params <- function(Fy, eta1, alphas, Omegas) {
Deltas <- Map(solve, Omegas)
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
list(mu_y = mu_y, Deltas = Deltas)
}
# scaled negative log-likelihood
log.likelihood <- function(X, Fy, eta1, alphas, Omegas) {
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions
# conditional mean
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas))
# negative log-likelihood scaled by sample size
# Note: the `suppressWarnings` is because `log(mapply(det, Omegas))`
# might fail, but `NAGD` has failsafes against cases of "illegal"
# parameters.
suppressWarnings(
0.5 * prod(p) * log(2 * pi) +
sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) -
(0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p)
)
}
# gradient of the scaled negative log-likelihood
grad <- function(X, Fy, eta1, alphas, Omegas) {
# retrieve dimensions
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions
r <- length(p) # single predictor/response tensor order
## "Inverse" Link: Tensor Normal Specific
# known exponential family constants
c1 <- 1
c2 <- -0.5
# Covariances from the GLM parameter Scatter matrices
Deltas <- Map(solve, Omegas)
# First moment via "inverse" link `g1(eta_y) = E[X | Y = y]`
E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
# Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]`
dim(E1) <- c(prod(p), n)
E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1))
## end "Inverse" Link
dim(X) <- c(prod(p), n)
# Residuals
R <- X - E1
dim(R) <- c(p, n)
# mean deviation between the sample covariance and the GLM estimated covariance
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
S <- rowMeans(colKronecker(X, X)) - E2 # <- Optimized for Tensor Normal
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
# Gradients of the negative log-likelihood scaled by sample size
list(
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
"Dl(alphas)" = Map(function(j) {
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
}, 1:r),
"Dl(Omegas)" = Map(function(j) {
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
# adapt to the symmetric constraint for the derivative
dim(deriv) <- c(p[j], p[j])
deriv + t(deriv * (1 - diag(p[j])))
}, 1:r)
)
}
# mean conditional Fisher Information
fisher.info <- function(Fy, eta1, alphas, Omegas) {
# retrieve dimensions
n <- tail(dim(Fy), 1) # sample size
p <- dim(eta1) # predictor dimensions
q <- head(dim(Fy), -1) # response dimensions
r <- length(p) # single predictor/response tensor order
# Hij = Cov(ti(X) %x% tj(X) | Y = y), i, j = 1, 2
H11 <- Reduce(`%x%`, rev(Map(solve, Omegas))) # covariance
# 3rd central moment is zero
H12 <- H21 <- 0
# 4th moment by "Isserlis' theorem" a.k.a. "Wick's theorem"
H22 <- kronecker(H11, H11)
dim(H22) <- rep(prod(p), 4)
H22 <- H22 + aperm(H22, c(1, 3, 2, 4)) + aperm(H22, c(1, 3, 2, 4))
dim(H11) <- c(p, p)
dim(H22) <- c(p, p, p, p)
## Fisher Information: Tensor Normal Specific
# known exponential family constants
c1 <- 1
c2 <- -0.5
# list of (Fy x_{k in [r]\j} alpha_j)
Gys <- Map(function(j) {
mlm(Fy, alphas[-j], (1:r)[-j])
}, 1:r)
# setup indices of lower triangular matrix elements used for
# all tuples (j, l) with 1 <= j <= l <= r.
ltri <- lower.tri(diag(r), TRUE)
J <- .col(c(r, r))[ltri]
L <- .row(c(r, r))[ltri]
# inverse perfect outer shuffle of `2 r` elements
iShuf <- rep(1:r, each = 2) + c(0, r)
# Getter for the i'th observation from Gys[[j]]
# TODO: this is ugly, find a better (but still dynamic) version
GyGet <- function(i, j) {
obs <- eval(str2lang(paste(
"Gys[[j]][",
paste(rep(",", r), collapse = ""),
"i, drop = FALSE]"
, collapse = "")))
dim(obs) <- head(dim(obs), -1)
obs
}
I11 <- c1^2 * mat(H11, 1:r)
I12 <- Map(function(j) {
i11 <- ttt(Gys[[j]], H11, (1:r)[-j])
# take mean with respect to observations (second mode)
i11 <- colMeans(mat(i11, 2))
dim(i11) <- c(q[j], p[j], p)
t(mat(i11, 2:1))
}, 1:r)
I13 <- Map(matrix, 0, prod(p), p^2)
I22 <- Map(function(j, l) {
i22 <- 0
for (i in 1:n) {
i22 <- i22 + ttt(
ttt(GyGet(i, j), H11, (1:r)[-j]),
GyGet(i, l), (1:r)[-l] + 2, (1:r)[-l])
}
(c1^2 / n) * mat(i22, 2:1)
}, J, L)
I23 <- Map(matrix, 0, (p * q)[J], (p^2)[L])
# shuffled H22
sH22 <- c2^2 * aperm(H22, c(iShuf, iShuf + 2 * r))
dim(sH22) <- c(p^2, p^2)
I33 <- Map(function(j, l) {
# second derivative (without constraints)
deriv <- mlm(mlm(sH22, Map(c, Omegas[-j]), (1:r)[-j],
transposed = TRUE),
Map(c, Omegas[-l]), (1:r)[-l] + r,
transposed = TRUE)
dim(deriv) <- c(p[j], p[j], p[l], p[l])
# enforce symmetry of `Omega_j`
of.diag <- (slice.index(deriv, 1:2) != slice.index(deriv, 2:1))
deriv <- deriv + of.diag * aperm(deriv, c(2, 1, 3, 4))
# as well as the symmetry of `Omega_l`
of.diag <- (slice.index(deriv, 3:4) != slice.index(deriv, 4:3))
deriv <- deriv + of.diag * aperm(deriv, c(1, 2, 4, 3))
# matricize and return
dim(deriv) <- c(p[j]^2, p[l]^2)
deriv
}, J, L)
names(I12) <- sprintf("I(eta1, alpha_%d)", 1:r)
names(I13) <- sprintf("I(eta1, Omega_%d)", 1:r)
names(I22) <- sprintf("I(alpha_%d, alpha_%d)", J, L)
names(I23) <- sprintf("I(alpha_%d, Omega_%d)", J, L)
names(I33) <- sprintf("I(Omega_%d, Omega_%d)", J, L)
list(I11 = I11, I12 = I12, I13 = I13, I22 = I22, I23 = I23, I33 = I33)
}
# Hessian of the scaled negative log-likelihood
hessian <- function(X, Fy, eta1, alphas, Omegas) {
stop("Not Implemented")
}
},
########################################################################
### Multi-Variate Bernoulli ###
########################################################################
bernoulli = {
require(mvbernoulli)
initialize <- function(X, Fy) {
p <- head(dim(X), -1)
r <- length(p)
# Mode-Covariances
XSigmas <- mcov(X, sample.axis = r + 1L)
YSigmas <- mcov(Fy, sample.axis = r + 1L)
# Extract main mode covariance directions
# Note: (the directions are transposed!)
XDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, XSigmas)
YDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, YSigmas)
alphas <- Map(function(xdir, ydir) {
s <- min(ncol(xdir), nrow(ydir))
crossprod(xdir[seq_len(s), , drop = FALSE],
ydir[seq_len(s), , drop = FALSE])
}, XDirs, YDirs)
list(
eta1 = array(0, dim = p),
alphas = alphas,
Omegas = Map(diag, p)
)
}
params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# number of observations
n <- tail(dim(Fy), 1)
# natural exponential family parameters
eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1))
eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas))
# # next the conditional Ising model parameters `theta_y`
# theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n)
# dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n)
# ltri <- which(lower.tri(eta_y2, diag = TRUE))
# diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
# theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1)
# theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ]
# conditional Ising model parameters
theta_y <- matrix(rep(vech(eta_y2), n), ncol = n)
ltri <- which(lower.tri(eta_y2, diag = TRUE))
diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
theta_y[diagonal, ] <- eta_y1
theta_y
}
# Scaled negative log-likelihood
log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# number of observations
n <- tail(dim(X), 1L)
# conditional Ising model parameters
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
# convert to binary data set
storage.mode(X) <- "integer"
X.mvb <- as.mvbinary(mat(X, length(dim(X))))
# log-likelihood of the data set
-mean(sapply(seq_len(n), function(i) {
ising_log_likelihood(theta_y[, i], X.mvb[i])
}))
}
# Gradient of the scaled negative log-likelihood
grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# retrieve dimensions
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions
r <- length(p) # single predictor/response tensor order
## "Inverse" Link: Ising Model Specific
# conditional Ising model parameters: `p (p + 1) / 2` by `n`
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
# conditional expectations
# ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y]
E2 <- apply(theta_y, 2L, ising_marginal_probs)
# convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y]
E2 <- E2[vech.pinv.index(prod(p)), ]
# extract diagonal elements which are equal to E[vec(X) | Y = y]
E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ]
## end "Inverse" Link
dim(X) <- c(prod(p), n)
# Residuals
R <- X - E1
dim(R) <- c(p, n)
# mean deviation between the sample covariance and the GLM estimated covariance
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
S <- rowMeans(colKronecker(X, X) - E2)
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
# Gradients of the negative log-likelihood scaled by sample size
list(
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
"Dl(alphas)" = Map(function(j) {
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
}, 1:r),
"Dl(Omegas)" = Map(function(j) {
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
# adapt to the symmetric constraint for the derivative
dim(deriv) <- c(p[j], p[j])
deriv + t(deriv * (1 - diag(p[j])))
}, 1:r)
)
}
# Hessian of the scaled negative log-likelihood
hessian <- function(X, Fy, alphas, Omegas, c1 = 1, c2 = 1) {
stop("Not Implemented")
}
# Conditional Fisher Information
fisher.info <- function(Fy, alphas, Omegas) {
stop("Not Implemented")
}
}
)
list(
family = name,
initialize = initialize,
params = params,
# linkinv = linkinv,
log.likelihood = log.likelihood,
grad = grad,
hessian = hessian,
fisher.info = fisher.info
)
}
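## A minimal usage sketch of the returned family object (mirroring the tests
## added in this commit; `X` and `Fy` are assumed to be predictor/response
## arrays with the sample in the last axis):
# family <- make.gmlm.family("normal")
# init <- family$initialize(X, Fy)
# nll <- with(init, family$log.likelihood(X, Fy, eta1, alphas, Omegas))
# gradients <- with(init, family$grad(X, Fy, eta1, alphas, Omegas))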

View File

@ -15,35 +15,43 @@
#' #'
#' @export #' @export
matrixImage <- function(A, add.values = FALSE,
main = NULL, sub = NULL, interpolate = FALSE, ..., zlim = NA,
axes = TRUE, asp = 1, col = hcl.colors(24, "YlOrRd", rev = FALSE),
digits = getOption("digits")
) {
# plot raster image
plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "black",
xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main, sub = sub,
asp = asp)
# Scale values of `A` to [0, 1] with min mapped to 1 and max to 0.
if (missing(zlim)) {
S <- (max(A) - A) / diff(range(A))
} else {
S <- pmin(pmax(0, (zlim[2] - A) / diff(zlim)), 1)
}
# and now transform to colors
S <- matrix(col[round((length(col) - 1) * S + 1)], nrow = nrow(A))
# Raster image plotting the matrix with element values mapped to the color
# scale `col`.
rasterImage(S, 0, 0, ncol(A), nrow(A), interpolate = interpolate, ...)
# X/Y axes index (matches coordinates to matrix indices)
x <- seq(1, ncol(A), by = 1)
y <- seq(1, nrow(A))
if (axes) {
axis(1, at = x - 0.5, labels = x, lwd = 0, lwd.ticks = 1)
axis(2, at = y - 0.5, labels = rev(y), lwd = 0, lwd.ticks = 1, las = 1)
}
# Write matrix values
if (any(add.values)) {
if (length(add.values) > 1) {
A[!add.values] <- NA
A[add.values] <- format(A[add.values], digits = digits)
}
text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A, adj = 0.5)
}
}
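## A usage sketch of the extended interface (`zlim`, `axes`, `asp` and `col`
## are the newly added arguments; the matrix below is made up for illustration):
# A <- matrix(rnorm(3 * 4), 3, 4)
# matrixImage(A, add.values = TRUE, zlim = c(-2, 2), axes = FALSE)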

View File

@ -3,34 +3,101 @@
#' @example inst/examples/num_deriv.R
#'
#' @export
num.deriv <- function(expr, ..., h = 1e-6, sym = FALSE) {
sexpr <- substitute(expr)
if (...length() != 1) {
stop("Expectd exactly one '...' variable")
}
var <- ...names()[1]
if (is.null(var)) {
arg <- substitute(...)
var <- if (is.symbol(arg)) as.character(arg) else "x"
}
if (is.language(sexpr) && !is.symbol(sexpr) && sexpr[[1]] == as.symbol("function")) {
func <- expr
} else {
if (is.name(sexpr)) {
expr <- call(as.character(sexpr), as.name(var))
} else {
if ((!is.call(sexpr) && !is.expression(sexpr))
|| !(var %in% all.vars(sexpr))) {
stop("'expr' must be a function or expression containing '", var, "'")
}
expr <- sexpr
}
args <- as.pairlist(structure(list(alist(x = )[[1]]), names = var))
func <- as.function(c(args, expr), envir = parent.frame())
}
num.deriv.function(func, ..1, h = h, sym = sym)
}
#' @rdname num.deriv
#' @export
num.deriv.function <- function(FUN, X, h = 1e-6, sym = FALSE) {
if (sym) {
stopifnot(isSymmetric(X))
p <- nrow(X)
k <- seq_along(X) - 1
mapply(function(i, j) {
dx <- h * ((k == i * p + j) | (k == j * p + i))
(FUN(X + dx) - FUN(X - dx)) / (2 * h)
}, .row(dim(X)) - 1, .col(dim(X)) - 1)
} else {
sapply(seq_along(X), function(i) {
dx <- h * (seq_along(X) == i)
(FUN(X + dx) - FUN(X - dx)) / (2 * h)
})
}
}
#' @rdname num.deriv
#' @export
num.deriv2 <- function(FUN, X, Y, h = 1e-6, symX = FALSE, symY = FALSE) {
if (missing(Y)) {
num.deriv.function(function(x) {
num.deriv.function(FUN, x, h = h, sym = symX)
}, X, h = h, sym = symX)
} else {
num.deriv.function(function(y) {
num.deriv.function(function(x) FUN(x, y), X, h = h, sym = symX)
}, Y, h = h, sym = symY)
}
}
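## A short usage sketch of the extended interface (mirroring
## inst/examples/num_deriv.R): the derivative can be taken of a function, of
## an expression in an unnamed variable, or of an expression with the
## evaluation point passed by name.
# A <- matrix(rnorm(3 * 3), 3, 3)
# B <- matrix(rnorm(3 * 4), 3, 4)
# num.deriv.function(function(X) X %*% B, A) # function interface
# num.deriv(A %*% B, A) # expression interface
# num.deriv(X %*% B, X = A) # named evaluation point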
### Interface Idea / Demo
# num.deriv2.function
# num.deriv2 <- function(expr, var) {
# sexpr <- substitute(expr)
# svar <- substitute(var)
#
# if (is.language(sexpr) && !is.symbol(sexpr) && sexpr[[1]] == as.symbol("function")) {
# func <- expr
# } else {
# if (is.name(sexpr)) {
# expr <- call(as.character(sexpr), as.name(svar))
# } else {
# if ((!is.call(sexpr) && !is.expression(sexpr))
# || !(as.character(svar) %in% all.vars(sexpr))) {
# stop("'expr' must be a function or expression containing '",
# as.character(svar), "'")
# }
# expr <- sexpr
# }
#
# args <- as.pairlist(structure(list(alist(x = )[[1]]), names = as.character(svar)))
# func <- as.function(c(args, expr), envir = parent.frame())
# }
#
# num.deriv2.function(func, var)
# }
# y <- c(pi, exp(1), (sqrt(5) + 1) / 2)
# num.deriv2(function(x) x^2 + y, x)(1:3)
# num.deriv2(x^2 + y, x)(1:3)
# func <- function(x) x^2 + y
# num.deriv2(func, x)(1:3)
# func2 <- function(z = y, x) x^2 + z
# num.deriv2(func2, x)(1:3)

View File

@ -4,15 +4,17 @@ A <- A + t(A)
B <- matrix(rnorm(3 * 4), 3, 4)
# F(A) = A B
# DF(A) = B' %x% I
exprs.all.equal({
t(B) %x% diag(nrow(A))
num.deriv(function(X) X %*% B, A)
num.deriv(A %*% B, A)
num.deriv(X %*% B, X = A)
})
# Symmetric case, constraint A = A' (equiv to being a function of vech(A) only)
# F(A) = A B for A = A'
# DF(A) = B' %x% I
stopifnot(all.equal(
num.deriv(function(X) X %*% B, A, sym = TRUE),
(t(B) %x% diag(nrow(A))) %*% D(nrow(A)) %*% t(D(nrow(A)))

View File

@ -0,0 +1,4 @@
library(testthat)
library(tensorPredictors)
test_check("tensorPredictors")
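# Note: a sketch of how the added tests can be run (assuming `devtools` is
# installed; it is not a dependency of the package itself):
# devtools::test() # from the package root
# testthat::test_local() # using only testthat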

View File

@ -0,0 +1,50 @@
# F(X) = log(det(X)) if det(X) > 0
# dF(X) = tr(X^-1 dX)
# DF(X) = vec((X^-1)')'
test_that("Matrix Calculus 1", {
# test data
X <- tcrossprod(diag(5) + matrix(runif(5^2, -.1, .1), 5))
num.grad <- num.deriv(function(X) log(det(X)), X)
ana.grad <- c(solve(X))
expect_equal(num.grad, ana.grad)
})
# F(mu) = <X - mu, (X - mu) x_{k in [r]} Delta_K^-1> for Delta_k = Delta_k'
# DF(mu) = -2 vec((X - mu) x_{k in [r]} Delta_K^-1)'
test_that("Matrix Calculus 2", {
p <- c(2, 3, 5)
X <- array(rnorm(prod(p)), dim = p)
mu <- array(rnorm(prod(p)), dim = p)
Deltas <- Map(function(pj) tcrossprod(diag(pj) + runif(pj^2, -0.1, 0.1)), p)
ana.grad <- -2 * c(mlm(X - mu, Map(solve, Deltas)))
num.grad <- num.deriv(function(mu) sum((X - mu) * mlm(X - mu, Map(solve, Deltas))), mu)
expect_equal(num.grad, ana.grad)
})
# F(Delta_j) = <X - mu, (X - mu) x_{k in [r]} Delta_K^-1> for Delta_k = Delta_k'
# DF(Delta_j) = -((X - mu) x_{k in [r]} Delta_K^-1)_(j) (X - mu)_(j)' Delta_j^-1
test_that("Matrix Calculus 3", {
# config
p <- c(2, 3, 5)
# generate test data
X <- array(rnorm(prod(p)), dim = p)
mu <- array(rnorm(prod(p)), dim = p)
Deltas <- Map(function(pj) tcrossprod(diag(pj) + runif(pj^2, -0.1, 0.1)), p)
# check analytic to numeric derivatives
for (j in seq_along(Deltas)) {
num.grad <- matrix(num.deriv(function(Delta_j) {
Deltas[[j]] <- Delta_j
sum((X - mu) * mlm(X - mu, Map(solve, Deltas)))
}, Deltas[[j]]), p[j])
ana.grad <- -mcrossprod(mlm(X - mu, Map(solve, Deltas)), X - mu, j) %*% solve(Deltas[[j]])
expect_equal(num.grad, ana.grad)
}
})

View File

@ -0,0 +1,186 @@
test_that("Tensor-Normal Score", {
# setup dimensions
n <- 11L
p <- c(3L, 5L, 4L)
q <- c(2L, 7L, 3L)
# create "true" GLM parameters
eta1 <- array(rnorm(prod(p)), dim = p)
alphas <- Map(matrix, Map(rnorm, p * q), p)
Omegas <- Map(function(pj) {
solve(0.5^abs(outer(seq_len(pj), seq_len(pj), `-`)))
}, p)
params <- list(eta1, alphas, Omegas)
# compute tensor normal parameters from GLM parameters
Deltas <- Map(solve, Omegas)
mu <- mlm(eta1, Deltas)
# sample some test data
sample.axis <- length(p) + 1L
Fy <- array(rnorm(n * prod(q)), dim = c(q, n))
X <- mlm(Fy, Map(`%*%`, Deltas, alphas)) + rtensornorm(n, mu, Deltas, sample.axis)
# Create a GLM family object
family <- make.gmlm.family("normal")
# first unpack the family object
log.likelihood <- family$log.likelihood
grad <- family$grad
# compare numeric gradient against the analytic gradient provided by the
# family at the initial parameters
grad.num <- do.call(function(eta1, alphas, Omegas) {
list(
"Dl(eta1)" = num.deriv.function(function(eta1) {
log.likelihood(X, Fy, eta1, alphas, Omegas)
}, eta1),
"Dl(alphas)" = Map(function(j) num.deriv.function(function(alpha_j) {
alphas[[j]] <- alpha_j
log.likelihood(X, Fy, eta1, alphas, Omegas)
}, alphas[[j]]), seq_along(alphas)),
"Dl(Omegas)" = Map(function(j) num.deriv.function(function(Omega_j) {
Omegas[[j]] <- Omega_j
log.likelihood(X, Fy, eta1, alphas, Omegas)
}, Omegas[[j]], sym = TRUE), seq_along(Omegas))
)
}, params)
grad.ana <- do.call(grad, c(list(X, Fy), params))
expect_equal(RMap(c, grad.ana), grad.num, tolerance = 1e-6)
# test for correct dimensions
ana.dims <- list(
"Dl(eta1)" = p,
"Dl(alphas)" = Map(c, p, q),
"Dl(Omegas)" = Map(c, p, p)
)
expect_identical(RMap(dim, grad.ana), ana.dims)
})
test_that("Tensor-Normal moments of the sufficient statistic t(X)", {
# config
# # (estimated) sample size for sample estimates
# n <- 250000 # sample estimates are too unreliable for testing purposes
p <- c(2L, 3L)
r <- length(p)
# setup tensor normal parameters (NO dependence of Fy, set to zero)
mu <- array(runif(prod(p), min = -0.5, max = 0.5), dim = p) # = mu_y
Deltas <- Map(function(pj) 0.5^abs(outer(1:pj, 1:pj, `-`)), p)
# compute GMLM parameters (NO alphas as Fy is zero)
Omegas <- Map(solve, Deltas)
eta1 <- mlm(mu, Omegas)
# natural parameters of the tensor normal
eta_y1 <- eta1 # + mlm(Fy, alphas) = + 0
eta_y2 <- -0.5 * Reduce(`%x%`, rev(Omegas))
# # draw a sample (disabled together with the sample estimates below)
# X <- rtensornorm(n, mu, Deltas, r + 1L)
# tensor normal log-partition function given eta_y
log.partition <- function(eta_y1, eta_y2) {
# eta_y1 as "model" matrix of dimensions `n by p` where `n` might be `1`
if (length(eta_y1)^2 == length(eta_y2)) {
pp <- length(eta_y1)
dim(eta_y1) <- c(1L, pp)
} else {
eta_y1 <- mat(eta_y1, r + 1L)
pp <- ncol(eta_y1)
}
# treat eta_y2 as square matrix
if (!is.matrix(eta_y2)) {
dim(eta_y2) <- c(pp, pp)
}
# evaluate log-partition function in terms of natural parameters
-0.25 * pp * mean((eta_y1 %*% solve(eta_y2)) * eta_y1) - 0.5 * log(det(-2 * eta_y2))
}
# (analytic) first and second (un-centered) moment of the tensor normal
# Eti = E[ti(X) | Y = y]
Et1.ana <- mu
Et2.ana <- Reduce(`%x%`, rev(Deltas)) + outer(c(mu), c(mu))
# (numeric) derivative of the log-partition function with respect to the natural
# parameters of the exponential family form of the tensor normal
Et1.num <- num.deriv(log.partition(eta_y1, eta_y2), eta_y1)
Et2.num <- num.deriv(log.partition(eta_y1, eta_y2), eta_y2)
# # (estimated) estimate from sample
# Et1.est <- rowMeans(X, dims = r)
# Et2.est <- colMeans(rowKronecker(mat(X, r + 1L), mat(X, r + 1L)))
# (analytic) covariance blocks of the sufficient statistic
# Ctij = Cov(ti(X), tj(X) | Y = y)
Ct11.ana <- Reduce(`%x%`, rev(Deltas))
Ct12.ana <- local({
# Analytic solution / `vec(mu %x% Delta) + vec(mu' %x% Delta)`
ct12 <- 2 * c(mu) %x% Reduce(`%x%`, rev(Deltas))
# and symmetrize!
dim(ct12) <- rep(prod(p), 3)
(1 / 2) * (ct12 + aperm(ct12, c(3, 1, 2)))
})
Ct22.ana <- local({
Delta <- Reduce(`%x%`, rev(Deltas)) # Ct11.ana
ct22 <- (2 * Delta + 4 * outer(c(mu), c(mu))) %x% Delta
# TODO: What does this symmetrization do exactly? And why?!
dim(ct22) <- rep(prod(p), 4)
(1 / 4) * (
aperm(ct22, c(2, 3, 1, 4)) +
aperm(ct22, c(2, 3, 4, 1)) +
aperm(ct22, c(3, 2, 1, 4)) +
aperm(ct22, c(3, 2, 4, 1))
)
})
# (numeric) second derivative of the log-partition function
Ct11.num <- num.deriv2(function(eta_y1) log.partition(eta_y1, eta_y2), eta_y1)
Ct12.num <- num.deriv2(log.partition, eta_y1, eta_y2)
Ct22.num <- local({
ct22 <- num.deriv2(function(eta_y2) log.partition(eta_y1, eta_y2), eta_y2)
# TODO: Why does this need to be symmetrized?!
dim(ct22) <- rep(prod(p), 4)
(1 / 4) * (
aperm(ct22, c(3, 4, 2, 1)) +
aperm(ct22, c(4, 3, 2, 1)) +
aperm(ct22, c(3, 4, 1, 2)) +
aperm(ct22, c(4, 3, 1, 2))
)
})
# # (estimated) sample estimates of the sufficient statistic convariance blocks
# T1 <- mat(X, r + 1L)
# T2 <- rowKronecker(T1, T1)
# Ct11.est <- cov(T1)
# Ct12.est <- cov(T1, T2)
# Ct22.est <- cov(T2, T2)
tolerance <- 1e-3
expect_equal(c(Et1.ana), c(Et1.num), tolerance = tolerance)
# expect_equal(c(Et1.ana), c(Et1.est), tolerance = 0.05, scale = 1)
# expect_equal(c(Et1.num), c(Et1.est), tolerance = 0.05, scale = 1)
expect_equal(c(Et2.ana), c(Et2.num), tolerance = tolerance)
# expect_equal(c(Et2.ana), c(Et2.est), tolerance = 0.05, scale = 1)
# expect_equal(c(Et2.num), c(Et2.est), tolerance = 0.05, scale = 1)
expect_equal(c(Ct11.ana), c(Ct11.num), tolerance = tolerance)
# expect_equal(c(Ct11.ana), c(Ct11.est), tolerance = 0.05, scale = 1)
# expect_equal(c(Ct11.num), c(Ct11.est), tolerance = 0.05, scale = 1)
expect_equal(c(Ct12.ana), c(Ct12.num), tolerance = tolerance)
# expect_equal(c(Ct12.ana), c(Ct12.est), tolerance = 0.05, scale = 1)
# expect_equal(c(Ct12.num), c(Ct12.est), tolerance = 0.05, scale = 1)
expect_equal(c(Ct22.ana), c(Ct22.num), tolerance = tolerance)
# expect_equal(c(Ct22.ana), c(Ct22.est), tolerance = 0.05, scale = 1)
# expect_equal(c(Ct22.num), c(Ct22.est), tolerance = 0.05, scale = 1)
})