diff --git a/LaTeX/GMLM.tex b/LaTeX/GMLM.tex index 73f3d6f..f621ad1 100644 --- a/LaTeX/GMLM.tex +++ b/LaTeX/GMLM.tex @@ -139,11 +139,12 @@ \maketitle + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{abstract} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - We consider regression and classification for \textit{general} response and tensor-valued predictors (multi dimensional arrays) and propose a \textit{novel formulation} for method for sufficient dimension reduction. Assumingof Tensor-valued predictor (multi dimensional arrays) for regression or classification. the distribution of the tensor-valued predictors given the response is in theWe assume an qQuadratic eExponential fFamily, we model the natural parameter for a Generalized Linear Model in an inverse regression setting where the relation via a link is of as a multi-linear function of the responsnature. - ThisUsing a multi-linear relation allows to perform per-axis reductions that drastically which reduces the total number of parameters drastically for higher order tTensor-valued predictors. WUnder the Exponential Family we derive maximum likelihood estimates for the multi-linear sufficient dimension reduction and of the Tensor-valued predictors. Furthermore, we provide a computationally efficient n estimation algorithm which leveragutilizes the tTensor structure allowing efficient implementations. The performance of the method is illustrated via simulations and real world examples are provided. + We consider regression and classification for \textit{general} response and tensor-valued predictors (multi-dimensional arrays) and propose a \textit{novel formulation} for sufficient dimension reduction. Assuming the distribution of the tensor-valued predictors given the response is in the quadratic exponential family, we model the natural parameter as a multi-linear function of the response. + This allows per-axis reductions that drastically reduce the total number of parameters for higher-order tensor-valued predictors. We derive maximum likelihood estimates of the sufficient dimension reduction and provide a computationally efficient estimation algorithm that leverages the tensor structure. The performance of the method is illustrated via simulations, and real-world examples are provided.
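The per-axis reduction claimed in the revised abstract can be made concrete. The display below is a sketch in the manuscript's notation ($\ten{F}_y$ and $\mat{\alpha}_k$ of dimension $p_k\times q_k$ are assumed as introduced later in GMLM.tex, not quoted from the abstract): the first natural parameter is modeled multi-linearly, so the parameter count drops from a product to a sum,

\begin{displaymath}
    \ten{\eta}_{y} = \overline{\ten{\eta}}_1 + \ten{F}_y\mlm{k\in[r]}\mat{\alpha}_k,
    \qquad
    \underbrace{\textstyle\sum_{k=1}^{r} p_k q_k}_{\text{multi-linear model}}
    \;\ll\;
    \underbrace{\textstyle\prod_{k=1}^{r} p_k \prod_{k=1}^{r} q_k}_{\text{unstructured model}}.
\end{displaymath}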
\end{abstract} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -523,13 +524,13 @@ The next step is to identify the Hessians from the second differentials in a sim &\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \vec\biggl( (\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j) \Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))} \biggr) \\ &= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big)_{(j)}\otimes\mat{I}_{p_j} \biggr) \mat{K}_{\mat{p},(j)}\mat{H}_{1,1}\t{\mat{K}_{\mat{p},(l)}} \biggl( \t{\Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big)_{(l)}}\otimes\mat{I}_{p_l} \biggr)\vec{\d\mat{\alpha}_l} \\ &\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \biggl( \t{\Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))}}\otimes\mat{I}_{p_j p_l} \biggr) \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\ - &= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \vec{\d\mat{\alpha}_l} \\ + &= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \vec{\d\mat{\alpha}_l} \\ &\qquad + c_1 \vec\biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr) \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\ - &= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \vec{\d\mat{\alpha}_l} \\ + &= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \vec{\d\mat{\alpha}_l} \\ &\qquad + c_1 \t{(\vec{\d\mat{\alpha}_j})} \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3))} \vec{\d\mat{\alpha}_l} \\ &\qquad \begin{aligned} \Rightarrow \frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\mat{\alpha}_l})}} &= - -c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \\ + -c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \\ &\qquad + c_1 \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3) + [[j > l]])} \qquad{\color{gray} (p_j q_j \times p_l q_l)} \end{aligned} @@ -591,22 +592,23 @@ and with that all auxiliary calculation are done. 
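The mixed Hessian blocks identified in the hunk above can be sanity-checked numerically. The following is a hedged sketch (toy bilinear objective, made-up dimensions) relying on num.deriv2() as it is redefined further down in this diff: for f(A, B) = <X, A Fy B'> the block d^2 f / d vec(A) d vec(B)' has the closed form Fy %x% X, mirroring the structure of the alpha_j / alpha_l blocks.

    # cross-check a mixed second derivative against its closed form
    set.seed(1)
    X  <- matrix(rnorm(3 * 4), 3, 4)   # "data", plays the role of ten{X}
    Fy <- matrix(rnorm(2 * 5), 2, 5)   # "response features", plays the role of ten{F}_y
    A  <- matrix(rnorm(3 * 2), 3, 2)   # plays the role of alpha_j
    B  <- matrix(rnorm(4 * 5), 4, 5)   # plays the role of alpha_l

    f <- function(A, B) sum(X * (A %*% Fy %*% t(B)))

    hess.num <- num.deriv2(f, A, B)    # nested central differences, (3*2) x (4*5)
    hess.ana <- Fy %x% X               # closed form: H[(i,k), (j,l)] = X[i,j] * Fy[k,l]
    # loose tolerance since nested finite differences are noisy
    stopifnot(all.equal(hess.num, hess.ana, tolerance = 1e-2))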
Using the two expectations yie \mathcal{I}_{2 r + 1,1} & \mathcal{I}_{2 r + 1,1} & \cdots & \mathcal{I}_{2 r + 1, 2 r + 1} \end{pmatrix} \end{displaymath} -for $j, l = 1, ..., r$ as follows +and for every block holds $\mathcal{I}_{j, l} = \t{\mathcal{I}_{l, j}}$. The individual blocks are given with $j = 1, ..., r$ and $j \leq l \leq r$ by \begin{align*} \mathcal{I}_{1,1} &= c_1^2 (\ten{H}_{1,1})_{([r])} \\ - \mathcal{I}_{1,j+1} = \t{\mathcal{I}_{j+1,1}} + \mathcal{I}_{1,j+1} % = \E\partial_{\vec{\overline{\ten{\eta}}_1}}\partial_{\t{(\vec{\mat{\alpha}_j})}} l(\mat{\Theta})\mid \ten{Y} = y &= c_1^2 \Big[\Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1}\Big]_{((2, 1))} \\ - \mathcal{I}_{1,j+1} = \t{\mathcal{I}_{j+1,1}} + \mathcal{I}_{1,j+r+1} &= c_1 c_2 \Big( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)} \\ - \mathcal{I}_{j+1,l+1} = \t{\mathcal{I}_{l+1,j+1}} + \mathcal{I}_{j+1,l+1} &= c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \\ - \mathcal{I}_{j+1,l+r+1} = \t{\mathcal{I}_{l+r+1,j+1}} + \mathcal{I}_{j+1,l+r+1} &= c_1 c_2 \biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{((r + 2, 1))} \\ - \mathcal{I}_{j+r+1,l+r+1} = \t{\mathcal{I}_{l+r+1,j+r+1}} + \mathcal{I}_{j+r+1,l+r+1} &= c_2^2 \Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)} \end{align*} + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Vectorization and Matricization} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -615,7 +617,7 @@ The \emph{matricization} is a generalization of the \emph{vectorization} operati \begin{theorem}\label{thm:mlm_mat} Let $\ten{A}$ be a tensor of order $r$ with the dimensions $q_1\times ... \times q_r$. Furthermore, let for $k = 1, ..., r$ be $\mat{B}_k$ matrices of dimensions $p_k\times q_k$. Then, for any $(\mat{i}, \mat{j})\in\perm{r}$ holds \begin{displaymath} - (\ten{A}\times_{k\in[r]}\mat{B}_k)_{(\mat{i}, \mat{j})} + \Big(\ten{A}\mlm{k\in[r]}\mat{B}_k\Big)_{(\mat{i}, \mat{j})} = \Big(\bigotimes_{k = \len{\mat{i}}}^{1}\mat{B}_{\mat{i}_k}\Big) \ten{A}_{(\mat{i}, \mat{j})} \Big(\bigotimes_{k = \len{\mat{j}}}^{1}\t{\mat{B}_{\mat{j}_k}}\Big). \end{displaymath} \end{theorem} diff --git a/tensorPredictors/NAMESPACE b/tensorPredictors/NAMESPACE index efe36de..b97d6ab 100644 --- a/tensorPredictors/NAMESPACE +++ b/tensorPredictors/NAMESPACE @@ -47,6 +47,7 @@ export(mkm) export(mlm) export(mtvk) export(num.deriv) +export(num.deriv.function) export(num.deriv2) export(reduce) export(rowKronecker) diff --git a/tensorPredictors/R/GMLM.R b/tensorPredictors/R/GMLM.R index 7a442bf..f80c5d3 100644 --- a/tensorPredictors/R/GMLM.R +++ b/tensorPredictors/R/GMLM.R @@ -5,265 +5,6 @@ GMLM <- function(...) 
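Theorem \ref{thm:mlm_mat} from the GMLM.tex hunk above can be spot-checked with the package's mlm() and mat(). The snippet below is a sketch of the simplest split $(\mat{i}, \mat{j}) = ((1), (2, 3))$, i.e. the classical mode-1 unfolding, and assumes mat() orders the remaining modes ascendingly.

    # mode-1 special case: (A x_{k in [3]} B_k)_(1) == B_1 A_(1) t(B_3 %x% B_2)
    q  <- c(2, 3, 4)                    # dimensions of A
    p  <- c(5, 6, 7)                    # row dimensions of the B_k
    A  <- array(rnorm(prod(q)), dim = q)
    Bs <- Map(function(pk, qk) matrix(rnorm(pk * qk), pk, qk), p, q)

    lhs <- mat(mlm(A, Bs), 1)
    rhs <- Bs[[1]] %*% mat(A, 1) %*% t(Bs[[3]] %x% Bs[[2]])
    stopifnot(all.equal(lhs, rhs))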
{ stop("Not Implemented") } - -make.gmlm.family <- function(name) { - # standardize family name - name <- list( - normal = "normal", gaussian = "normal", - bernoulli = "bernoulli", ising = "bernoulli" - )[[tolower(name), exact = FALSE]] - - ############################################################################ - # # - # TODO: better (and possibly specialized) initial parameters!?!?!?! # - # # - ############################################################################ - - switch(name, - normal = { - initialize <- function(X, Fy) { - p <- head(dim(X), -1) - r <- length(p) - - # Mode-Covariances - XSigmas <- mcov(X, sample.axis = r + 1L) - YSigmas <- mcov(Fy, sample.axis = r + 1L) - - # Extract main mode covariance directions - # Note: (the directions are transposed!) - XDirs <- Map(function(Sigma) { - SVD <- La.svd(Sigma, nu = 0) - sqrt(SVD$d) * SVD$vt - }, XSigmas) - YDirs <- Map(function(Sigma) { - SVD <- La.svd(Sigma, nu = 0) - sqrt(SVD$d) * SVD$vt - }, YSigmas) - - alphas <- Map(function(xdir, ydir) { - s <- min(ncol(xdir), nrow(ydir)) - crossprod(xdir[seq_len(s), , drop = FALSE], - ydir[seq_len(s), , drop = FALSE]) - }, XDirs, YDirs) - - list( - eta1 = rowMeans(X, dims = r), - alphas = alphas, - Omegas = Map(diag, p) - ) - } - - # parameters of the tensor normal computed from the GLM parameters - params <- function(Fy, eta1, alphas, Omegas) { - Deltas <- Map(solve, Omegas) - mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas) - list(mu_y = mu_y, Deltas = Deltas) - } - - # scaled negative log-likelihood - log.likelihood <- function(X, Fy, eta1, alphas, Omegas) { - n <- tail(dim(X), 1) # sample size - - # conditional mean - mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas)) - - # negative log-likelihood scaled by sample size - # Note: the `suppressWarnings` is cause `log(mapply(det, Omegas)` - # migth fail, but `NAGD` has failsaves againt cases of "illegal" - # parameters. 
- suppressWarnings( - 0.5 * prod(p) * log(2 * pi) + - sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) - - (0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p) - ) - } - # gradient of the scaled negative log-likelihood - grad <- function(X, Fy, eta1, alphas, Omegas) { - # retrieve dimensions - n <- tail(dim(X), 1) # sample size - p <- head(dim(X), -1) # predictor dimensions - r <- length(p) # single predictor/response tensor order - - ## "Inverse" Link: Tensor Normal Specific - # known exponential family constants - c1 <- 1 - c2 <- -0.5 - - # Covariances from the GLM parameter Scatter matrices - Deltas <- Map(solve, Omegas) - - # First moment via "inverse" link `g1(eta_y) = E[X | Y = y]` - E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas) - - # Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]` - dim(E1) <- c(prod(p), n) - E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1)) - ## end "Inverse" Link - - dim(X) <- c(prod(p), n) - - # Residuals - R <- X - E1 - dim(R) <- c(p, n) - - # mean deviation between the sample covariance to GLM estimated covariance - # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))` - S <- rowMeans(colKronecker(X, X)) - E2 # <- Optimized for Tensor Normal - dim(S) <- c(p, p) # reshape to tensor or order `2 r` - - # Gradients of the negative log-likelihood scaled by sample size - list( - "Dl(eta1)" = -c1 * rowMeans(R, dims = r), - "Dl(alphas)" = Map(function(j) { - (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j) - }, 1:r), - "Dl(Omegas)" = Map(function(j) { - deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j])) - # addapt to symmetric constraint for the derivative - dim(deriv) <- c(p[j], p[j]) - deriv + t(deriv * (1 - diag(p[j]))) - }, 1:r) - ) - } - }, - bernoulli = { - require(mvbernoulli) - - initialize <- function(X, Fy) { - p <- head(dim(X), -1) - r <- length(p) - - # Mode-Covariances - XSigmas <- mcov(X, sample.axis = r + 1L) - YSigmas <- mcov(Fy, sample.axis = r + 1L) - - # Extract main mode covariance directions - # Note: (the directions are transposed!) 
- XDirs <- Map(function(Sigma) { - SVD <- La.svd(Sigma, nu = 0) - sqrt(SVD$d) * SVD$vt - }, XSigmas) - YDirs <- Map(function(Sigma) { - SVD <- La.svd(Sigma, nu = 0) - sqrt(SVD$d) * SVD$vt - }, YSigmas) - - alphas <- Map(function(xdir, ydir) { - s <- min(ncol(xdir), nrow(ydir)) - crossprod(xdir[seq_len(s), , drop = FALSE], - ydir[seq_len(s), , drop = FALSE]) - }, XDirs, YDirs) - - list( - eta1 = array(0, dim = p), - alphas = alphas, - Omegas = Map(diag, p) - ) - } - - params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) { - # number of observations - n <- tail(dim(Fy), 1) - - # natural exponential family parameters - eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1)) - eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas)) - - # # next the conditional Ising model parameters `theta_y` - # theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n) - # dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n) - - # ltri <- which(lower.tri(eta_y2, diag = TRUE)) - # diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri]) - # theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1) - # theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ] - - # conditional Ising model parameters - theta_y <- matrix(rep(vech(eta_y2), n), ncol = n) - ltri <- which(lower.tri(eta_y2, diag = TRUE)) - diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri]) - theta_y[diagonal, ] <- eta_y1 - - theta_y - } - - # Scaled ngative log-likelihood - log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) { - # number of observations - n <- tail(dim(X), 1L) - - # conditional Ising model parameters - theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2) - - # convert to binary data set - storage.mode(X) <- "integer" - X.mvb <- as.mvbinary(mat(X, length(dim(X)))) - - # log-likelihood of the data set - -mean(sapply(seq_len(n), function(i) { - ising_log_likelihood(theta_y[, i], X.mvb[i]) - })) - } - - # Gradient of the scaled negative log-likelihood - grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) { - # retrieve dimensions - n <- tail(dim(X), 1) # sample size - p <- head(dim(X), -1) # predictor dimensions - r <- length(p) # single predictor/response tensor order - - ## "Inverse" Link: Ising Model Specific - # conditional Ising model parameters: `p (p + 1) / 2` by `n` - theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2) - - # conditional expectations - # ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y] - E2 <- apply(theta_y, 2L, ising_marginal_probs) - # convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y] - E2 <- E2[vech.pinv.index(prod(p)), ] - # extract diagonal elements which are equal to E[vec(X) | Y = y] - E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ] - ## end "Inverse" Link - - dim(X) <- c(prod(p), n) - - # Residuals - R <- X - E1 - dim(R) <- c(p, n) - - # mean deviation between the sample covariance to GLM estimated covariance - # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))` - S <- rowMeans(colKronecker(X, X) - E2) - dim(S) <- c(p, p) # reshape to tensor or order `2 r` - - # Gradients of the negative log-likelihood scaled by sample size - list( - "Dl(eta1)" = -c1 * rowMeans(R, dims = r), - "Dl(alphas)" = Map(function(j) { - (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j) - }, 1:r), - "Dl(Omegas)" = Map(function(j) { - deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j])) - # addapt to symmetric constraint for the derivative - dim(deriv) <- c(p[j], p[j]) - deriv + t(deriv * (1 - diag(p[j]))) - }, 1:r) - ) - } - } - ) - - list( - family = name, - initialize = 
initialize, - params = params, - # linkinv = linkinv, - log.likelihood = log.likelihood, - grad = grad - ) -} - - #' @export GMLM.default <- function(X, Fy, sample.axis = 1L, family = "normal", @@ -292,6 +33,7 @@ GMLM.default <- function(X, Fy, sample.axis = 1L, } } + # optimize likelihood using Nesterov Excelerated Gradient Descent params.fit <- NAGD( fun.loss = function(params) { # scaled negative lig-likelihood diff --git a/tensorPredictors/R/exprs_all_equal.R b/tensorPredictors/R/exprs_all_equal.R index f569ab0..c5f3f93 100644 --- a/tensorPredictors/R/exprs_all_equal.R +++ b/tensorPredictors/R/exprs_all_equal.R @@ -24,7 +24,7 @@ #' matrix(1, 2, 3) #' array(rep(1, 6), dim = c(2, 3)) #' }) -#' # basicaly identical to +#' # is identical to #' stopifnot(exprs = { #' all.equal(matrix(rep(1, 6), 2, 3), matrix(1, 2, 3)) #' all.equal(matrix(rep(1, 6), 2, 3), array(rep(1, 6), dim = c(2, 3))) diff --git a/tensorPredictors/R/make_gmlm_family.R b/tensorPredictors/R/make_gmlm_family.R new file mode 100644 index 0000000..513bb03 --- /dev/null +++ b/tensorPredictors/R/make_gmlm_family.R @@ -0,0 +1,386 @@ + +make.gmlm.family <- function(name) { + # standardize family name + name <- list( + normal = "normal", gaussian = "normal", + bernoulli = "bernoulli", ising = "bernoulli" + )[[tolower(name), exact = FALSE]] + + ############################################################################ + # # + # TODO: better (and possibly specialized) initial parameters!?!?!?! # + # # + ############################################################################ + + switch(name, + ######################################################################## + ### Tensor Normal ### + ######################################################################## + normal = { + initialize <- function(X, Fy) { + p <- head(dim(X), -1) + r <- length(p) + + # Mode-Covariances + XSigmas <- mcov(X, sample.axis = r + 1L) + YSigmas <- mcov(Fy, sample.axis = r + 1L) + + # Extract main mode covariance directions + # Note: (the directions are transposed!) + XDirs <- Map(function(Sigma) { + SVD <- La.svd(Sigma, nu = 0) + sqrt(SVD$d) * SVD$vt + }, XSigmas) + YDirs <- Map(function(Sigma) { + SVD <- La.svd(Sigma, nu = 0) + sqrt(SVD$d) * SVD$vt + }, YSigmas) + + alphas <- Map(function(xdir, ydir) { + s <- min(ncol(xdir), nrow(ydir)) + crossprod(xdir[seq_len(s), , drop = FALSE], + ydir[seq_len(s), , drop = FALSE]) + }, XDirs, YDirs) + + list( + eta1 = rowMeans(X, dims = r), + alphas = alphas, + Omegas = Map(diag, p) + ) + } + + # parameters of the tensor normal computed from the GLM parameters + params <- function(Fy, eta1, alphas, Omegas) { + Deltas <- Map(solve, Omegas) + mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas) + list(mu_y = mu_y, Deltas = Deltas) + } + + # scaled negative log-likelihood + log.likelihood <- function(X, Fy, eta1, alphas, Omegas) { + n <- tail(dim(X), 1) # sample size + + # conditional mean + mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas)) + + # negative log-likelihood scaled by sample size + # Note: the `suppressWarnings` is cause `log(mapply(det, Omegas)` + # migth fail, but `NAGD` has failsaves againt cases of "illegal" + # parameters. 
+ suppressWarnings( + 0.5 * prod(p) * log(2 * pi) + + sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) - + (0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p) + ) + } + + # gradient of the scaled negative log-likelihood + grad <- function(X, Fy, eta1, alphas, Omegas) { + # retrieve dimensions + n <- tail(dim(X), 1) # sample size + p <- head(dim(X), -1) # predictor dimensions + r <- length(p) # single predictor/response tensor order + + ## "Inverse" Link: Tensor Normal Specific + # known exponential family constants + c1 <- 1 + c2 <- -0.5 + + # Covariances from the GLM parameter Scatter matrices + Deltas <- Map(solve, Omegas) + + # First moment via "inverse" link `g1(eta_y) = E[X | Y = y]` + E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas) + + # Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]` + dim(E1) <- c(prod(p), n) + E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1)) + ## end "Inverse" Link + + dim(X) <- c(prod(p), n) + + # Residuals + R <- X - E1 + dim(R) <- c(p, n) + + # mean deviation between the sample covariance to GLM estimated covariance + # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))` + S <- rowMeans(colKronecker(X, X)) - E2 # <- Optimized for Tensor Normal + dim(S) <- c(p, p) # reshape to tensor or order `2 r` + + # Gradients of the negative log-likelihood scaled by sample size + list( + "Dl(eta1)" = -c1 * rowMeans(R, dims = r), + "Dl(alphas)" = Map(function(j) { + (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j) + }, 1:r), + "Dl(Omegas)" = Map(function(j) { + deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j])) + # addapt to symmetric constraint for the derivative + dim(deriv) <- c(p[j], p[j]) + deriv + t(deriv * (1 - diag(p[j]))) + }, 1:r) + ) + } + + # mean conditional Fisher Information + fisher.info <- function(Fy, eta1, alphas, Omegas) { + # retrieve dimensions + n <- tail(dim(Fy), 1) # sample size + p <- dim(eta1) # predictor dimensions + q <- head(dim(Fy), -1) # response dimensions + r <- length(p) # single predictor/response tensor order + + # Hij = Cov(ti(X) %x% tj(X) | Y = y), i, j = 1, 2 + H11 <- Reduce(`%x%`, rev(Map(solve, Omegas))) # covariance + # 3rd central moment is zero + H12 <- H21 <- 0 + # 4th moment by "Isserlis' theorem" a.k.a. "Wick's theorem" + H22 <- kronecker(H11, H11) + dim(H22) <- rep(prod(p), 4) + H22 <- H22 + aperm(H22, c(1, 3, 2, 4)) + aperm(H22, c(1, 3, 2, 4)) + + dim(H11) <- c(p, p) + dim(H22) <- c(p, p, p, p) + + ## Fisher Information: Tensor Normal Specific + # known exponential family constants + c1 <- 1 + c2 <- -0.5 + + # list of (Fy x_{k in [r]\j} alpha_j) + Gys <- Map(function(j) { + mlm(Fy, alphas[-j], (1:r)[-j]) + }, 1:r) + + # setup indices of lower triangular matrix elements used for + # all tuples (j, l) with 1 <= j <= l <= r. 
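+            # e.g. for r = 3: J = c(1, 1, 1, 2, 2, 3) and L = c(1, 2, 3, 2, 3, 3),
+            # i.e. all pairs (j, l) with j <= l, indexing the blocks I22, I23
+            # and I33 constructed below.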
+ ltri <- lower.tri(diag(r), TRUE) + J <- .col(c(r, r))[ltri] + L <- .row(c(r, r))[ltri] + + # inverse perfect outer shuffle of `2 r` elements + iShuf <- rep(1:r, each = 2) + c(0, r) + + # Getter for the i'th observation from Gys[[j]] + # TODO: this is ugly, find a better (but still dynamic) version + GyGet <- function(i, j) { + obs <- eval(str2lang(paste( + "Gys[[j]][", + paste(rep(",", r), collapse = ""), + "i, drop = FALSE]" + , collapse = ""))) + dim(obs) <- head(dim(obs), -1) + obs + } + + I11 <- c1^2 * mat(H11, 1:r) + I12 <- Map(function(j) { + i11 <- ttt(Gys[[j]], H11, (1:r)[-j]) + # take mean with respect to observations (second mode) + i11 <- colMeans(mat(i11, 2)) + dim(i11) <- c(q[j], p[j], p) + t(mat(i11, 2:1)) + }, 1:r) + I13 <- Map(matrix, 0, prod(p), p^2) + I22 <- Map(function(j, l) { + i22 <- 0 + for (i in 1:n) { + i22 <- i22 + ttt( + ttt(GyGet(i, j), H11, (1:r)[-j]), + GyGet(i, l), (1:r)[-l] + 2, (1:r)[-l]) + } + (c1^2 / n) * mat(i22, 2:1) + }, J, L) + I23 <- Map(matrix, 0, (p * q)[J], (p^2)[L]) + + # shuffled H22 + sH22 <- c2^2 * aperm(H22, c(iShuf, iShuf + 2 * r)) + dim(sH22) <- c(p^2, p^2) + I33 <- Map(function(j, l) { + # second derivative (without constraints) + deriv <- mlm(mlm(sH22, Map(c, Omegas[-j]), (1:r)[-j], + transposed = TRUE), + Map(c, Omegas[-l]), (1:r)[-l] + r, + transposed = TRUE) + dim(deriv) <- c(p[j], p[j], p[l], p[l]) + + # enforce symmetry of `Omega_j` + of.diag <- (slice.index(deriv, 1:2) != slice.index(deriv, 2:1)) + deriv <- deriv + of.diag * aperm(deriv, c(2, 1, 3, 4)) + + # as well as the symmetry of `Omega_l` + of.diag <- (slice.index(deriv, 3:4) != slice.index(deriv, 4:3)) + deriv <- deriv + of.diag * aperm(deriv, c(1, 2, 4, 3)) + + # matricize and return + dim(deriv) <- c(p[j]^2, p[l]^2) + deriv + }, J, L) + + names(I12) <- sprintf("I(eta1, alpha_%d)", 1:r) + names(I13) <- sprintf("I(eta1, Omega_%d)", 1:r) + names(I22) <- sprintf("I(alpha_%d, alpha_%d)", J, L) + names(I23) <- sprintf("I(alpha_%d, Omega_%d)", J, L) + names(I33) <- sprintf("I(Omega_%d, Omega_%d)", J, L) + + list(I11 = I11, I12 = I12, I13 = I13, I22 = I22, I23 = I23, I33 = I33) + } + + # Hessian of the scaled negative log-likelihood + hessian <- function(X, Fy, eta1, alphas, Omegas) { + stop("Not Implemented") + } + }, + + ######################################################################## + ### Multi-Variate Bernoulli ### + ######################################################################## + bernoulli = { + require(mvbernoulli) + + initialize <- function(X, Fy) { + p <- head(dim(X), -1) + r <- length(p) + + # Mode-Covariances + XSigmas <- mcov(X, sample.axis = r + 1L) + YSigmas <- mcov(Fy, sample.axis = r + 1L) + + # Extract main mode covariance directions + # Note: (the directions are transposed!) 
+ XDirs <- Map(function(Sigma) { + SVD <- La.svd(Sigma, nu = 0) + sqrt(SVD$d) * SVD$vt + }, XSigmas) + YDirs <- Map(function(Sigma) { + SVD <- La.svd(Sigma, nu = 0) + sqrt(SVD$d) * SVD$vt + }, YSigmas) + + alphas <- Map(function(xdir, ydir) { + s <- min(ncol(xdir), nrow(ydir)) + crossprod(xdir[seq_len(s), , drop = FALSE], + ydir[seq_len(s), , drop = FALSE]) + }, XDirs, YDirs) + + list( + eta1 = array(0, dim = p), + alphas = alphas, + Omegas = Map(diag, p) + ) + } + + params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) { + # number of observations + n <- tail(dim(Fy), 1) + + # natural exponential family parameters + eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1)) + eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas)) + + # # next the conditional Ising model parameters `theta_y` + # theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n) + # dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n) + + # ltri <- which(lower.tri(eta_y2, diag = TRUE)) + # diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri]) + # theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1) + # theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ] + + # conditional Ising model parameters + theta_y <- matrix(rep(vech(eta_y2), n), ncol = n) + ltri <- which(lower.tri(eta_y2, diag = TRUE)) + diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri]) + theta_y[diagonal, ] <- eta_y1 + + theta_y + } + + # Scaled ngative log-likelihood + log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) { + # number of observations + n <- tail(dim(X), 1L) + + # conditional Ising model parameters + theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2) + + # convert to binary data set + storage.mode(X) <- "integer" + X.mvb <- as.mvbinary(mat(X, length(dim(X)))) + + # log-likelihood of the data set + -mean(sapply(seq_len(n), function(i) { + ising_log_likelihood(theta_y[, i], X.mvb[i]) + })) + } + + # Gradient of the scaled negative log-likelihood + grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) { + # retrieve dimensions + n <- tail(dim(X), 1) # sample size + p <- head(dim(X), -1) # predictor dimensions + r <- length(p) # single predictor/response tensor order + + ## "Inverse" Link: Ising Model Specific + # conditional Ising model parameters: `p (p + 1) / 2` by `n` + theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2) + + # conditional expectations + # ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y] + E2 <- apply(theta_y, 2L, ising_marginal_probs) + # convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y] + E2 <- E2[vech.pinv.index(prod(p)), ] + # extract diagonal elements which are equal to E[vec(X) | Y = y] + E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ] + ## end "Inverse" Link + + dim(X) <- c(prod(p), n) + + # Residuals + R <- X - E1 + dim(R) <- c(p, n) + + # mean deviation between the sample covariance to GLM estimated covariance + # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))` + S <- rowMeans(colKronecker(X, X) - E2) + dim(S) <- c(p, p) # reshape to tensor or order `2 r` + + # Gradients of the negative log-likelihood scaled by sample size + list( + "Dl(eta1)" = -c1 * rowMeans(R, dims = r), + "Dl(alphas)" = Map(function(j) { + (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j) + }, 1:r), + "Dl(Omegas)" = Map(function(j) { + deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j])) + # addapt to symmetric constraint for the derivative + dim(deriv) <- c(p[j], p[j]) + deriv + t(deriv * (1 - diag(p[j]))) + }, 1:r) + ) + } + + # Hessian of the scaled negative 
log-likelihood + hessian <- function(X, Fy, alphas, Omegas, c1 = 1, c2 = 1) { + stop("Not Implemented") + } + + # Conditional Fisher Information + fisher.info <- function(Fy, alphas, Omegas) { + stop("Not Implemented") + } + } + ) + + list( + family = name, + initialize = initialize, + params = params, + # linkinv = linkinv, + log.likelihood = log.likelihood, + grad = grad, + hessian = hessian, + fisher.info = fisher.info + ) +} diff --git a/tensorPredictors/R/matrixImage.R b/tensorPredictors/R/matrixImage.R index 6c6a8eb..aeb3d9d 100644 --- a/tensorPredictors/R/matrixImage.R +++ b/tensorPredictors/R/matrixImage.R @@ -15,35 +15,43 @@ #' #' @export matrixImage <- function(A, add.values = FALSE, - main = NULL, sub = NULL, interpolate = FALSE, ..., + main = NULL, sub = NULL, interpolate = FALSE, ..., zlim = NA, + axes = TRUE, asp = 1, col = hcl.colors(24, "YlOrRd", rev = FALSE), digits = getOption("digits") ) { # plot raster image - plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "red", - xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main, sub = sub) + plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "black", + xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main, sub = sub, + asp = asp) # Scale values of `A` to [0, 1] with min mapped to 1 and max to 0. - S <- (max(A) - A) / diff(range(A)) + if (missing(zlim)) { + S <- (max(A) - A) / diff(range(A)) + } else { + S <- pmin(pmax(0, (zlim[2] - A) / diff(zlim)), 1) + } + # and not transform to color + S <- matrix(col[round((length(col) - 1) * S + 1)], nrow = nrow(A)) # Raster Image ploting the matrix with element values mapped to grayscale # as big elements (original matrix A) are dark and small (negative) elements # are white. rasterImage(S, 0, 0, ncol(A), nrow(A), interpolate = interpolate, ...) - # Add X-axis giving index + # X/Y axes index (matches coordinates to matrix indices) x <- seq(1, ncol(A), by = 1) - axis(1, at = x - 0.5, labels = x, lwd = 0, lwd.ticks = 1) - # Add Y-axis y <- seq(1, nrow(A)) - axis(2, at = y - 0.5, labels = rev(y), lwd = 0, lwd.ticks = 1, las = 1) + if (axes) { + axis(1, at = x - 0.5, labels = x, lwd = 0, lwd.ticks = 1) + axis(2, at = y - 0.5, labels = rev(y), lwd = 0, lwd.ticks = 1, las = 1) + } - # Writes matrix values (in colored element grids) + # Writes matrix values if (any(add.values)) { if (length(add.values) > 1) { A[!add.values] <- NA A[add.values] <- format(A[add.values], digits = digits) } - text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A, - adj = 0.5, col = as.integer(S > 0.65)) + text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A, adj = 0.5) } } diff --git a/tensorPredictors/R/num_deriv.R b/tensorPredictors/R/num_deriv.R index def2d28..9cb1375 100644 --- a/tensorPredictors/R/num_deriv.R +++ b/tensorPredictors/R/num_deriv.R @@ -3,34 +3,101 @@ #' @example inst/examples/num_deriv.R #' #' @export -num.deriv <- function(F, X, h = 1e-6, sym = FALSE) { +num.deriv <- function(expr, ..., h = 1e-6, sym = FALSE) { + sexpr <- substitute(expr) + if (...length() != 1) { + stop("Expectd exactly one '...' variable") + } + var <- ...names()[1] + if (is.null(var)) { + arg <- substitute(...) 
+ var <- if (is.symbol(arg)) as.character(arg) else "x" + } + + if (is.language(sexpr) && !is.symbol(sexpr) && sexpr[[1]] == as.symbol("function")) { + func <- expr + } else { + if (is.name(sexpr)) { + expr <- call(as.character(sexpr), as.name(var)) + } else { + if ((!is.call(sexpr) && !is.expression(sexpr)) + || !(var %in% all.vars(sexpr))) { + stop("'expr' must be a function or expression containing '", var, "'") + } + expr <- sexpr + } + + args <- as.pairlist(structure(list(alist(x = )[[1]]), names = var)) + func <- as.function(c(args, expr), envir = parent.frame()) + } + + num.deriv.function(func, ..1, h = h, sym = sym) +} + +#' @rdname num.deriv +#' @export +num.deriv.function <- function(FUN, X, h = 1e-6, sym = FALSE) { if (sym) { stopifnot(isSymmetric(X)) p <- nrow(X) k <- seq_along(X) - 1 mapply(function(i, j) { dx <- h * ((k == i * p + j) | (k == j * p + i)) - (F(X + dx) - F(X - dx)) / (2 * h) + (FUN(X + dx) - FUN(X - dx)) / (2 * h) }, .row(dim(X)) - 1, .col(dim(X)) - 1) } else { sapply(seq_along(X), function(i) { dx <- h * (seq_along(X) == i) - (F(X + dx) - F(X - dx)) / (2 * h) + (FUN(X + dx) - FUN(X - dx)) / (2 * h) }) } } -#' Second numeric derivative -#' +#' @rdname num.deriv #' @export -num.deriv2 <- function(F, X, Y, h = 1e-6, symX = FALSE, symY = FALSE) { +num.deriv2 <- function(FUN, X, Y, h = 1e-6, symX = FALSE, symY = FALSE) { if (missing(Y)) { - num.deriv(function(x) { - num.deriv(function(z) F(z), x, h = h, sym = symX) + num.deriv.function(function(x) { + num.deriv.function(FUN, x, h = h, sym = symX) }, X, h = h, sym = symX) } else { - num.deriv(function(y) { - num.deriv(function(x) F(x, y), X, h = h, sym = symX) + num.deriv.function(function(y) { + num.deriv.function(function(x) FUN(x, y), X, h = h, sym = symX) }, Y, h = h, sym = symY) } } + + +### Interface Idea / Demo +# num.deriv2.function +# num.deriv2 <- function(expr, var) { +# sexpr <- substitute(expr) +# svar <- substitute(var) +# +# if (is.language(sexpr) && !is.symbol(sexpr) && sexpr[[1]] == as.symbol("function")) { +# func <- expr +# } else { +# if (is.name(sexpr)) { +# expr <- call(as.character(sexpr), as.name(svar)) +# } else { +# if ((!is.call(sexpr) && !is.expression(sexpr)) +# || !(as.character(svar) %in% all.vars(sexpr))) { +# stop("'expr' must be a function or expression containing '", +# as.character(svar), "'") +# } +# expr <- sexpr +# } +# +# args <- as.pairlist(structure(list(alist(x = )[[1]]), names = as.character(svar))) +# func <- as.function(c(args, expr), envir = parent.frame()) +# } +# +# num.deriv2.function(func, var) +# } +# y <- c(pi, exp(1), (sqrt(5) + 1) / 2) +# num.deriv2(function(x) x^2 + y, x)(1:3) +# num.deriv2(x^2 + y, x)(1:3) +# func <- function(x) x^2 + y +# num.deriv2(func, x)(1:3) +# func2 <- function(z = y, x) x^2 + z +# num.deriv2(func2, x)(1:3) diff --git a/tensorPredictors/inst/examples/num_deriv.R b/tensorPredictors/inst/examples/num_deriv.R index 4ca2006..c711c52 100644 --- a/tensorPredictors/inst/examples/num_deriv.R +++ b/tensorPredictors/inst/examples/num_deriv.R @@ -4,15 +4,17 @@ A <- A + t(A) B <- matrix(rnorm(3 * 4), 3, 4) # F(A) = A B -# DF(A) = B' x I -stopifnot(all.equal( - num.deriv(function(X) X %*% B, A), +# DF(A) = B' %x% I +exprs.all.equal({ t(B) %x% diag(nrow(A)) -)) + num.deriv(function(X) X %*% B, A) + num.deriv(A %*% B, A) + num.deriv(X %*% B, X = A) +}) # Symmetric case, constraint A = A' (equiv to being a function of vech(A) only) # F(A) = A B for A = A' -# DF(A) = B' x I +# DF(A) = B' %x% I stopifnot(all.equal( num.deriv(function(X) X %*% B, A, sym = 
TRUE), (t(B) %x% diag(nrow(A))) %*% D(nrow(A)) %*% t(D(nrow(A))) diff --git a/tensorPredictors/tests/testthat.R b/tensorPredictors/tests/testthat.R new file mode 100644 index 0000000..6224019 --- /dev/null +++ b/tensorPredictors/tests/testthat.R @@ -0,0 +1,4 @@ +library(testthat) +library(tensorPredictors) + +test_check("tensorPredictors") diff --git a/tensorPredictors/tests/testthat/test-MatrixCalculus.R b/tensorPredictors/tests/testthat/test-MatrixCalculus.R new file mode 100644 index 0000000..1b35726 --- /dev/null +++ b/tensorPredictors/tests/testthat/test-MatrixCalculus.R @@ -0,0 +1,50 @@ +# F(X) = log(det(X)) if det(X) > 0 +# dF(X) = tr(X^-1 dX) +# DF(X) = vec((X^-1)')' +test_that("Matrix Calculus 1", { + # test data + X <- tcrossprod(diag(5) + matrix(runif(5^2, -.1, .1), 5)) + + num.grad <- num.deriv(function(X) log(det(X)), X) + ana.grad <- c(solve(X)) + + expect_equal(num.grad, ana.grad) +}) + +# F(mu) = for Delta_k = Delta_k' +# DF(mu) = -2 vec((X - mu) x_{k in [r]} Delta_K^-1)' +test_that("Matrix Calculus 2", { + p <- c(2, 3, 5) + + X <- array(rnorm(prod(p)), dim = p) + mu <- array(rnorm(prod(p)), dim = p) + Deltas <- Map(function(pj) tcrossprod(diag(pj) + runif(pj^2, -0.1, 0.1)), p) + + ana.grad <- -2 * c(mlm(X - mu, Map(solve, Deltas))) + num.grad <- num.deriv(function(mu) sum((X - mu) * mlm(X - mu, Map(solve, Deltas))), mu) + + expect_equal(num.grad, ana.grad) +}) + +# F(Delta_j) = for Delta_k = Delta_k' +# DF(Delta_j) = -((X - mu) x_{k in [r]} Delta_K^-1)_(j) (X - mu)_(j)' Delta_j^-1 +test_that("Matrix Calculus 3", { + # config + p <- c(2, 3, 5) + + # generate test data + X <- array(rnorm(prod(p)), dim = p) + mu <- array(rnorm(prod(p)), dim = p) + Deltas <- Map(function(pj) tcrossprod(diag(pj) + runif(pj^2, -0.1, 0.1)), p) + + # check analytic to numeric derivatives + for (j in seq_along(Deltas)) { + num.grad <- matrix(num.deriv(function(Delta_j) { + Deltas[[j]] <- Delta_j + sum((X - mu) * mlm(X - mu, Map(solve, Deltas))) + }, Deltas[[j]]), p[j]) + ana.grad <- -mcrossprod(mlm(X - mu, Map(solve, Deltas)), X - mu, j) %*% solve(Deltas[[j]]) + + expect_equal(num.grad, ana.grad) + } +}) diff --git a/tensorPredictors/tests/testthat/test-TensorNormalFamily.R b/tensorPredictors/tests/testthat/test-TensorNormalFamily.R new file mode 100644 index 0000000..d6a5b2e --- /dev/null +++ b/tensorPredictors/tests/testthat/test-TensorNormalFamily.R @@ -0,0 +1,186 @@ +test_that("Tensor-Normal Score", { + # setup dimensions + n <- 11L + p <- c(3L, 5L, 4L) + q <- c(2L, 7L, 3L) + + # create "true" GLM parameters + eta1 <- array(rnorm(prod(p)), dim = p) + alphas <- Map(matrix, Map(rnorm, p * q), p) + Omegas <- Map(function(pj) { + solve(0.5^abs(outer(seq_len(pj), seq_len(pj), `-`))) + }, p) + params <- list(eta1, alphas, Omegas) + + # compute tensor normal parameters from GLM parameters + Deltas <- Map(solve, Omegas) + mu <- mlm(eta1, Deltas) + + # sample some test data + sample.axis <- length(p) + 1L + Fy <- array(rnorm(n * prod(q)), dim = c(q, n)) + X <- mlm(Fy, Map(`%*%`, Deltas, alphas)) + rtensornorm(n, mu, Deltas, sample.axis) + + # Create a GLM family object + family <- make.gmlm.family("normal") + + # first unpack the family object + log.likelihood <- family$log.likelihood + grad <- family$grad + + # compare numeric gradient against the analytic gradient provided by the + # family at the initial parameters + grad.num <- do.call(function(eta1, alphas, Omegas) { + list( + "Dl(eta1)" = num.deriv.function(function(eta1) { + log.likelihood(X, Fy, eta1, alphas, Omegas) + }, eta1), + "Dl(alphas)" = 
Map(function(j) num.deriv.function(function(alpha_j) { + alphas[[j]] <- alpha_j + log.likelihood(X, Fy, eta1, alphas, Omegas) + }, alphas[[j]]), seq_along(alphas)), + "Dl(Omegas)" = Map(function(j) num.deriv.function(function(Omega_j) { + Omegas[[j]] <- Omega_j + log.likelihood(X, Fy, eta1, alphas, Omegas) + }, Omegas[[j]], sym = TRUE), seq_along(Omegas)) + ) + }, params) + grad.ana <- do.call(grad, c(list(X, Fy), params)) + + expect_equal(RMap(c, grad.ana), grad.num, tolerance = 1e-6) + + # test for correct dimensions + ana.dims <- list( + "Dl(eta1)" = p, + "Dl(alphas)" = Map(c, p, q), + "Dl(Omegas)" = Map(c, p, p) + ) + + expect_identical(RMap(dim, grad.ana), ana.dims) +}) + +test_that("Tensor-Normal moments of the sufficient statistic t(X)", { + # config + # # (estimated) sample size for sample estimates + # n <- 250000 # sample estimates are too unreliable for testing purposes + p <- c(2L, 3L) + r <- length(p) + + # setup tensor normal parameters (NO dependence of Fy, set to zero) + mu <- array(runif(prod(p), min = -0.5, max = 0.5), dim = p) # = mu_y + Deltas <- Map(function(pj) 0.5^abs(outer(1:pj, 1:pj, `-`)), p) + + # compute GMLM parameters (NO alphas as Fy is zero) + Omegas <- Map(solve, Deltas) + eta1 <- mlm(mu, Omegas) + + # natural parameters of the tensor normal + eta_y1 <- eta1 # + mlm(Fy, alphas) = + 0 + eta_y2 <- -0.5 * Reduce(`%x%`, rev(Omegas)) + + # draw a sample + X <- rtensornorm(n, mu, Deltas, r + 1L) + + # tensor normal log-partition function given eta_y + log.partition <- function(eta_y1, eta_y2) { + # eta_y1 as "model" matrix of dimensions `n by p` where `n` might be `1` + if (length(eta_y1)^2 == length(eta_y2)) { + pp <- length(eta_y1) + dim(eta_y1) <- c(1L, pp) + } else { + eta_y1 <- mat(eta_y1, r + 1L) + pp <- ncol(eta_y1) + } + + # treat eta_y2 as square matrix + if (!is.matrix(eta_y2)) { + dim(eta_y2) <- c(pp, pp) + } + + # evaluate log-partiton function in terms of natural parameters + -0.25 * pp * mean((eta_y1 %*% solve(eta_y2)) * eta_y1) - 0.5 * log(det(-2 * eta_y2)) + } + + + # (analytic) first and second (un-centered) moment of the tensor normal + # Eti = E[ti(X) | Y = y] + Et1.ana <- mu + Et2.ana <- Reduce(`%x%`, rev(Deltas)) + outer(c(mu), c(mu)) + + # (numeric) derivative of the log-partition function with respect to the natural + # parameters of the exponential family form of the tensor normal + Et1.num <- num.deriv(log.partition(eta_y1, eta_y2), eta_y1) + Et2.num <- num.deriv(log.partition(eta_y1, eta_y2), eta_y2) + + # # (estimated) estimate from sample + # Et1.est <- rowMeans(X, dims = r) + # Et2.est <- colMeans(rowKronecker(mat(X, r + 1L), mat(X, r + 1L))) + + # (analytic) convariance blocks of the sufficient statistic + # Ctij = Cov(ti(X), tj(X) | Y = y) + Ct11.ana <- Reduce(`%x%`, rev(Deltas)) + Ct12.ana <- local({ + # Analytic solution / `vec(mu %x% Delta) + vec(mu' %x% Delta)` + ct12 <- 2 * c(mu) %x% Reduce(`%x%`, rev(Deltas)) + # and symmetrize! + dim(ct12) <- rep(prod(p), 3) + (1 / 2) * (ct12 + aperm(ct12, c(3, 1, 2))) + }) + Ct22.ana <- local({ + Delta <- Reduce(`%x%`, rev(Deltas)) # Ct11.ana + ct22 <- (2 * Delta + 4 * outer(c(mu), c(mu))) %x% Delta + + # TODO: What does this symmetrization do exactly? And why?! 
+ dim(ct22) <- rep(prod(p), 4) + (1 / 4) * ( + aperm(ct22, c(2, 3, 1, 4)) + + aperm(ct22, c(2, 3, 4, 1)) + + aperm(ct22, c(3, 2, 1, 4)) + + aperm(ct22, c(3, 2, 4, 1)) + ) + }) + + # (numeric) second derivative of the log-partition function + Ct11.num <- num.deriv2(function(eta_y1) log.partition(eta_y1, eta_y2), eta_y1) + Ct12.num <- num.deriv2(log.partition, eta_y1, eta_y2) + Ct22.num <- local({ + ct22 <- num.deriv2(function(eta_y2) log.partition(eta_y1, eta_y2), eta_y2) + # TODO: Why does this need to be symmetrized?! + dim(ct22) <- rep(prod(p), 4) + (1 / 4) * ( + aperm(ct22, c(3, 4, 2, 1)) + + aperm(ct22, c(4, 3, 2, 1)) + + aperm(ct22, c(3, 4, 1, 2)) + + aperm(ct22, c(4, 3, 1, 2)) + ) + }) + + # # (estimated) sample estimates of the sufficient statistic convariance blocks + # T1 <- mat(X, r + 1L) + # T2 <- rowKronecker(T1, T1) + # Ct11.est <- cov(T1) + # Ct12.est <- cov(T1, T2) + # Ct22.est <- cov(T2, T2) + + + tolerance <- 1e-3 + expect_equal(c(Et1.ana), c(Et1.num), tolerance = tolerance) + # expect_equal(c(Et1.ana), c(Et1.est), tolerance = 0.05, scale = 1) + # expect_equal(c(Et1.num), c(Et1.est), tolerance = 0.05, scale = 1) + + expect_equal(c(Et2.ana), c(Et2.num), tolerance = tolerance) + # expect_equal(c(Et2.ana), c(Et2.est), tolerance = 0.05, scale = 1) + # expect_equal(c(Et2.num), c(Et2.est), tolerance = 0.05, scale = 1) + + expect_equal(c(Ct11.ana), c(Ct11.num), tolerance = tolerance) + # expect_equal(c(Ct11.ana), c(Ct11.est), tolerance = 0.05, scale = 1) + # expect_equal(c(Ct11.num), c(Ct11.est), tolerance = 0.05, scale = 1) + + expect_equal(c(Ct12.ana), c(Ct12.num), tolerance = tolerance) + # expect_equal(c(Ct12.ana), c(Ct12.est), tolerance = 0.05, scale = 1) + # expect_equal(c(Ct12.num), c(Ct12.est), tolerance = 0.05, scale = 1) + + expect_equal(c(Ct22.ana), c(Ct22.num), tolerance = tolerance) + # expect_equal(c(Ct22.ana), c(Ct22.est), tolerance = 0.05, scale = 1) + # expect_equal(c(Ct22.num), c(Ct22.est), tolerance = 0.05, scale = 1) +})
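The moment identities targeted by the numeric-derivative checks above follow from differentiating the log-partition function of the exponential-family form of the tensor normal. As a reference sketch in the manuscript's notation (not part of the test file):

\begin{align*}
    b(\mat{\eta}_{y,1}, \mat{\eta}_{y,2})
        &= -\tfrac{1}{4} \t{\mat{\eta}_{y,1}} \mat{\eta}_{y,2}^{-1} \mat{\eta}_{y,1}
           - \tfrac{1}{2} \log\det(-2 \mat{\eta}_{y,2}), \\
    \E[t_1(\ten{X}) \mid Y = y]
        &= \nabla_{\mat{\eta}_{y,1}} b
         = -\tfrac{1}{2} \mat{\eta}_{y,2}^{-1} \mat{\eta}_{y,1}
         = \vec{\mu_y}, \\
    \mathrm{Cov}(t_1(\ten{X}) \mid Y = y)
        &= \nabla^2_{\mat{\eta}_{y,1}} b
         = -\tfrac{1}{2} \mat{\eta}_{y,2}^{-1}
         = \bigotimes_{k = r}^{1} \mat{\Delta}_k,
\end{align*}

which is exactly what Et1.num/Et1.ana and Ct11.num/Ct11.ana compare; the derivatives with respect to $\mat{\eta}_{y,2}$ give the Et2, Ct12 and Ct22 blocks analogously.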