fix: some typos in GMLM.tex,
add: extended interface of num.deriv and matrixImage, add: "tests", add: moved make.gmlm.family into separate file
This commit is contained in:
parent
79f16c58dc
commit
2dfdc7083b
@@ -139,11 +139,12 @@
 \maketitle


 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{abstract}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-We propose a \textit{novel formulation} for a sufficient dimension reduction method of Tensor-valued predictor (multi dimensional arrays) for regression or classification. We assume the distribution of the tensor-valued predictors given the response is in the Quadratic Exponential Family and we model the natural parameter for a Generalized Linear Model in an inverse regression setting where the relation via a link is of multi-linear nature.
+We consider regression and classification for \textit{general} response and tensor-valued predictors (multi dimensional arrays) and propose a \textit{novel formulation} for sufficient dimension reduction. Assuming the distribution of the tensor-valued predictors given the response is in the quadratic exponential family, we model the natural parameter as a multi-linear function of the response.
-Using a multi-linear relation allows to perform per-axis reductions which reduces the total number of parameters drastically for higher order Tensor-valued predictors. Under the Exponential Family we derive maximum likelihood estimates for the multi-linear sufficient dimension reduction of the Tensor-valued predictors. Furthermore, we provide an estimation algorithm which utilizes the Tensor structure allowing efficient implementations. The performance of the method is illustrated via simulations and real world examples are provided.
+This allows per-axis reductions that drastically reduce the total number of parameters for higher order tensor-valued predictors. We derive maximum likelihood estimates for the sufficient dimension reduction and a computationally efficient estimation algorithm which leverages the tensor structure. The performance of the method is illustrated via simulations and real world examples are provided.
 \end{abstract}

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -523,13 +524,13 @@ The next step is to identify the Hessians from the second differentials in a sim
 &\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \vec\biggl( (\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j) \Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))} \biggr) \\
 &= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big)_{(j)}\otimes\mat{I}_{p_j} \biggr) \mat{K}_{\mat{p},(j)}\mat{H}_{1,1}\t{\mat{K}_{\mat{p},(l)}} \biggl( \t{\Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big)_{(l)}}\otimes\mat{I}_{p_l} \biggr)\vec{\d\mat{\alpha}_l} \\
 &\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \biggl( \t{\Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))}}\otimes\mat{I}_{p_j p_l} \biggr) \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\
-&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \vec{\d\mat{\alpha}_l} \\
+&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \vec{\d\mat{\alpha}_l} \\
 &\qquad + c_1 \t{\vec\biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)} \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\
-&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \vec{\d\mat{\alpha}_l} \\
+&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \vec{\d\mat{\alpha}_l} \\
 &\qquad + c_1 \t{(\vec{\d\mat{\alpha}_j})} \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3))} \vec{\d\mat{\alpha}_l} \\
 &\qquad \begin{aligned}
 \Rightarrow \frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\,\partial\t{(\vec{\mat{\alpha}_l})}} &=
-    -c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \\
+    -c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \\
 &\qquad + c_1 \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3) + [[j > l]])}
 \qquad{\color{gray} (p_j q_j \times p_l q_l)}
 \end{aligned}
@@ -591,22 +592,23 @@ and with that all auxiliary calculation are done. Using the two expectations yie
 \mathcal{I}_{2 r + 1,1} & \mathcal{I}_{2 r + 1,2} & \cdots & \mathcal{I}_{2 r + 1, 2 r + 1}
 \end{pmatrix}
 \end{displaymath}
-for $j, l = 1, ..., r$ as follows
+and for every block holds $\mathcal{I}_{j, l} = \t{\mathcal{I}_{l, j}}$. The individual blocks are given with $j = 1, ..., r$ and $j \leq l \leq r$ by
 \begin{align*}
 \mathcal{I}_{1,1} &= c_1^2 (\ten{H}_{1,1})_{([r])} \\
-\mathcal{I}_{1,j+1} = \t{\mathcal{I}_{j+1,1}}
+\mathcal{I}_{1,j+1} % = \E\partial_{\vec{\overline{\ten{\eta}}_1}}\partial_{\t{(\vec{\mat{\alpha}_j})}} l(\mat{\Theta})\mid \ten{Y} = y
 &= c_1^2 \Big[\Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1}\Big]_{((2, 1))} \\
-\mathcal{I}_{1,j+1} = \t{\mathcal{I}_{j+1,1}}
+\mathcal{I}_{1,j+r+1}
 &= c_1 c_2 \Big( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)} \\
-\mathcal{I}_{j+1,l+1} = \t{\mathcal{I}_{l+1,j+1}}
+\mathcal{I}_{j+1,l+1}
 &= c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \\
-\mathcal{I}_{j+1,l+r+1} = \t{\mathcal{I}_{l+r+1,j+1}}
+\mathcal{I}_{j+1,l+r+1}
 &= c_1 c_2 \biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{((r + 2, 1))} \\
-\mathcal{I}_{j+r+1,l+r+1} = \t{\mathcal{I}_{l+r+1,j+r+1}}
+\mathcal{I}_{j+r+1,l+r+1}
 &= c_2^2 \Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}
 \end{align*}



 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Vectorization and Matricization}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -615,7 +617,7 @@ The \emph{matricization} is a generalization of the \emph{vectorization} operati
 \begin{theorem}\label{thm:mlm_mat}
 Let $\ten{A}$ be a tensor of order $r$ with dimensions $q_1\times ... \times q_r$ and let $\mat{B}_k$, for $k = 1, ..., r$, be matrices of dimensions $p_k\times q_k$. Then, for any $(\mat{i}, \mat{j})\in\perm{r}$, it holds that
 \begin{displaymath}
-(\ten{A}\times_{k\in[r]}\mat{B}_k)_{(\mat{i}, \mat{j})}
+\Big(\ten{A}\mlm{k\in[r]}\mat{B}_k\Big)_{(\mat{i}, \mat{j})}
 = \Big(\bigotimes_{k = \len{\mat{i}}}^{1}\mat{B}_{\mat{i}_k}\Big) \ten{A}_{(\mat{i}, \mat{j})} \Big(\bigotimes_{k = \len{\mat{j}}}^{1}\t{\mat{B}_{\mat{j}_k}}\Big).
 \end{displaymath}
 \end{theorem}
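For intuition, the $r = 2$ case of the theorem contains the familiar identities $\mat{B}_1\mat{A}\t{\mat{B}_2}$ (mode-$1$ matricization) and $\vec(\mat{B}_1\mat{A}\t{\mat{B}_2}) = (\mat{B}_2\otimes\mat{B}_1)\vec{\mat{A}}$ (vectorization). A minimal base-R sanity check of the vectorization form, independent of the package's `mlm`/`mat` helpers:

# base-R check of the r = 2 special case of the matricization theorem
set.seed(1)
A  <- matrix(rnorm(3 * 4), 3, 4)    # "tensor" of order 2 (q_1 x q_2)
B1 <- matrix(rnorm(2 * 3), 2, 3)    # mode-1 factor (p_1 x q_1)
B2 <- matrix(rnorm(5 * 4), 5, 4)    # mode-2 factor (p_2 x q_2)
C  <- B1 %*% A %*% t(B2)            # A x_1 B1 x_2 B2
stopifnot(all.equal(c(C), c((B2 %x% B1) %*% c(A))))   # vec form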
@@ -47,6 +47,7 @@ export(mkm)
 export(mlm)
 export(mtvk)
 export(num.deriv)
+export(num.deriv.function)
 export(num.deriv2)
 export(reduce)
 export(rowKronecker)
@@ -5,265 +5,6 @@ GMLM <- function(...) {
     stop("Not Implemented")
 }


-make.gmlm.family <- function(name) {
-    # standardize family name
-    name <- list(
-        normal = "normal", gaussian = "normal",
-        bernoulli = "bernoulli", ising = "bernoulli"
-    )[[tolower(name), exact = FALSE]]
-
-    ############################################################################
-    #                                                                          #
-    #   TODO: better (and possibly specialized) initial parameters!?!?!?!      #
-    #                                                                          #
-    ############################################################################
-
-    switch(name,
-        normal = {
-            initialize <- function(X, Fy) {
-                p <- head(dim(X), -1)
-                r <- length(p)
-
-                # Mode-Covariances
-                XSigmas <- mcov(X, sample.axis = r + 1L)
-                YSigmas <- mcov(Fy, sample.axis = r + 1L)
-
-                # Extract main mode covariance directions
-                # Note: (the directions are transposed!)
-                XDirs <- Map(function(Sigma) {
-                    SVD <- La.svd(Sigma, nu = 0)
-                    sqrt(SVD$d) * SVD$vt
-                }, XSigmas)
-                YDirs <- Map(function(Sigma) {
-                    SVD <- La.svd(Sigma, nu = 0)
-                    sqrt(SVD$d) * SVD$vt
-                }, YSigmas)
-
-                alphas <- Map(function(xdir, ydir) {
-                    s <- min(ncol(xdir), nrow(ydir))
-                    crossprod(xdir[seq_len(s), , drop = FALSE],
-                              ydir[seq_len(s), , drop = FALSE])
-                }, XDirs, YDirs)
-
-                list(
-                    eta1 = rowMeans(X, dims = r),
-                    alphas = alphas,
-                    Omegas = Map(diag, p)
-                )
-            }
-
-            # parameters of the tensor normal computed from the GLM parameters
-            params <- function(Fy, eta1, alphas, Omegas) {
-                Deltas <- Map(solve, Omegas)
-                mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
-                list(mu_y = mu_y, Deltas = Deltas)
-            }
-
-            # scaled negative log-likelihood
-            log.likelihood <- function(X, Fy, eta1, alphas, Omegas) {
-                n <- tail(dim(X), 1) # sample size
-
-                # conditional mean
-                mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas))
-
-                # negative log-likelihood scaled by sample size
-                # Note: the `suppressWarnings` is cause `log(mapply(det, Omegas)`
-                # migth fail, but `NAGD` has failsaves againt cases of "illegal"
-                # parameters.
-                suppressWarnings(
-                    0.5 * prod(p) * log(2 * pi) +
-                        sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) -
-                        (0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p)
-                )
-            }
-            # gradient of the scaled negative log-likelihood
-            grad <- function(X, Fy, eta1, alphas, Omegas) {
-                # retrieve dimensions
-                n <- tail(dim(X), 1)  # sample size
-                p <- head(dim(X), -1) # predictor dimensions
-                r <- length(p)        # single predictor/response tensor order
-
-                ## "Inverse" Link: Tensor Normal Specific
-                # known exponential family constants
-                c1 <- 1
-                c2 <- -0.5
-
-                # Covariances from the GLM parameter Scatter matrices
-                Deltas <- Map(solve, Omegas)
-
-                # First moment via "inverse" link `g1(eta_y) = E[X | Y = y]`
-                E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
-
-                # Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]`
-                dim(E1) <- c(prod(p), n)
-                E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1))
-                ## end "Inverse" Link
-
-                dim(X) <- c(prod(p), n)
-
-                # Residuals
-                R <- X - E1
-                dim(R) <- c(p, n)
-
-                # mean deviation between the sample covariance to GLM estimated covariance
-                # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
-                S <- rowMeans(colKronecker(X, X)) - E2 # <- Optimized for Tensor Normal
-                dim(S) <- c(p, p) # reshape to tensor or order `2 r`
-
-                # Gradients of the negative log-likelihood scaled by sample size
-                list(
-                    "Dl(eta1)" = -c1 * rowMeans(R, dims = r),
-                    "Dl(alphas)" = Map(function(j) {
-                        (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
-                    }, 1:r),
-                    "Dl(Omegas)" = Map(function(j) {
-                        deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
-                        # addapt to symmetric constraint for the derivative
-                        dim(deriv) <- c(p[j], p[j])
-                        deriv + t(deriv * (1 - diag(p[j])))
-                    }, 1:r)
-                )
-            }
-        },
-        bernoulli = {
-            require(mvbernoulli)
-
-            initialize <- function(X, Fy) {
-                p <- head(dim(X), -1)
-                r <- length(p)
-
-                # Mode-Covariances
-                XSigmas <- mcov(X, sample.axis = r + 1L)
-                YSigmas <- mcov(Fy, sample.axis = r + 1L)
-
-                # Extract main mode covariance directions
-                # Note: (the directions are transposed!)
-                XDirs <- Map(function(Sigma) {
-                    SVD <- La.svd(Sigma, nu = 0)
-                    sqrt(SVD$d) * SVD$vt
-                }, XSigmas)
-                YDirs <- Map(function(Sigma) {
-                    SVD <- La.svd(Sigma, nu = 0)
-                    sqrt(SVD$d) * SVD$vt
-                }, YSigmas)
-
-                alphas <- Map(function(xdir, ydir) {
-                    s <- min(ncol(xdir), nrow(ydir))
-                    crossprod(xdir[seq_len(s), , drop = FALSE],
-                              ydir[seq_len(s), , drop = FALSE])
-                }, XDirs, YDirs)
-
-                list(
-                    eta1 = array(0, dim = p),
-                    alphas = alphas,
-                    Omegas = Map(diag, p)
-                )
-            }
-
-            params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
-                # number of observations
-                n <- tail(dim(Fy), 1)
-
-                # natural exponential family parameters
-                eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1))
-                eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas))
-
-                # # next the conditional Ising model parameters `theta_y`
-                # theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n)
-                # dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n)
-
-                # ltri <- which(lower.tri(eta_y2, diag = TRUE))
-                # diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
-                # theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1)
-                # theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ]
-
-                # conditional Ising model parameters
-                theta_y <- matrix(rep(vech(eta_y2), n), ncol = n)
-                ltri <- which(lower.tri(eta_y2, diag = TRUE))
-                diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
-                theta_y[diagonal, ] <- eta_y1
-
-                theta_y
-            }
-
-            # Scaled ngative log-likelihood
-            log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
-                # number of observations
-                n <- tail(dim(X), 1L)
-
-                # conditional Ising model parameters
-                theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
-
-                # convert to binary data set
-                storage.mode(X) <- "integer"
-                X.mvb <- as.mvbinary(mat(X, length(dim(X))))
-
-                # log-likelihood of the data set
-                -mean(sapply(seq_len(n), function(i) {
-                    ising_log_likelihood(theta_y[, i], X.mvb[i])
-                }))
-            }
-
-            # Gradient of the scaled negative log-likelihood
-            grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
-                # retrieve dimensions
-                n <- tail(dim(X), 1)  # sample size
-                p <- head(dim(X), -1) # predictor dimensions
-                r <- length(p)        # single predictor/response tensor order
-
-                ## "Inverse" Link: Ising Model Specific
-                # conditional Ising model parameters: `p (p + 1) / 2` by `n`
-                theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
-
-                # conditional expectations
-                # ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y]
-                E2 <- apply(theta_y, 2L, ising_marginal_probs)
-                # convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y]
-                E2 <- E2[vech.pinv.index(prod(p)), ]
-                # extract diagonal elements which are equal to E[vec(X) | Y = y]
-                E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ]
-                ## end "Inverse" Link
-
-                dim(X) <- c(prod(p), n)
-
-                # Residuals
-                R <- X - E1
-                dim(R) <- c(p, n)
-
-                # mean deviation between the sample covariance to GLM estimated covariance
-                # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
-                S <- rowMeans(colKronecker(X, X) - E2)
-                dim(S) <- c(p, p) # reshape to tensor or order `2 r`
-
-                # Gradients of the negative log-likelihood scaled by sample size
-                list(
-                    "Dl(eta1)" = -c1 * rowMeans(R, dims = r),
-                    "Dl(alphas)" = Map(function(j) {
-                        (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
-                    }, 1:r),
-                    "Dl(Omegas)" = Map(function(j) {
-                        deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
-                        # addapt to symmetric constraint for the derivative
-                        dim(deriv) <- c(p[j], p[j])
-                        deriv + t(deriv * (1 - diag(p[j])))
-                    }, 1:r)
-                )
-            }
-        }
-    )
-
-    list(
-        family = name,
-        initialize = initialize,
-        params = params,
-        # linkinv = linkinv,
-        log.likelihood = log.likelihood,
-        grad = grad
-    )
-}


 #' @export
 GMLM.default <- function(X, Fy, sample.axis = 1L,
                          family = "normal",
@@ -292,6 +33,7 @@ GMLM.default <- function(X, Fy, sample.axis = 1L,
         }
     }

+    # optimize likelihood using Nesterov Accelerated Gradient Descent
     params.fit <- NAGD(
         fun.loss = function(params) {
             # scaled negative log-likelihood
@@ -24,7 +24,7 @@
 #' matrix(1, 2, 3)
 #' array(rep(1, 6), dim = c(2, 3))
 #' })
-#' # basicaly identical to
+#' # is identical to
 #' stopifnot(exprs = {
 #'   all.equal(matrix(rep(1, 6), 2, 3), matrix(1, 2, 3))
 #'   all.equal(matrix(rep(1, 6), 2, 3), array(rep(1, 6), dim = c(2, 3)))
@@ -0,0 +1,386 @@
+make.gmlm.family <- function(name) {
+    # standardize family name
+    name <- list(
+        normal = "normal", gaussian = "normal",
+        bernoulli = "bernoulli", ising = "bernoulli"
+    )[[tolower(name), exact = FALSE]]
+
+    ############################################################################
+    #                                                                          #
+    #   TODO: better (and possibly specialized) initial parameters!?!?!?!      #
+    #                                                                          #
+    ############################################################################
+
+    switch(name,
+        ########################################################################
+        ###                          Tensor Normal                           ###
+        ########################################################################
+        normal = {
+            initialize <- function(X, Fy) {
+                p <- head(dim(X), -1)
+                r <- length(p)
+
+                # Mode-Covariances
+                XSigmas <- mcov(X, sample.axis = r + 1L)
+                YSigmas <- mcov(Fy, sample.axis = r + 1L)
+
+                # Extract main mode covariance directions
+                # Note: (the directions are transposed!)
+                XDirs <- Map(function(Sigma) {
+                    SVD <- La.svd(Sigma, nu = 0)
+                    sqrt(SVD$d) * SVD$vt
+                }, XSigmas)
+                YDirs <- Map(function(Sigma) {
+                    SVD <- La.svd(Sigma, nu = 0)
+                    sqrt(SVD$d) * SVD$vt
+                }, YSigmas)
+
+                alphas <- Map(function(xdir, ydir) {
+                    s <- min(ncol(xdir), nrow(ydir))
+                    crossprod(xdir[seq_len(s), , drop = FALSE],
+                              ydir[seq_len(s), , drop = FALSE])
+                }, XDirs, YDirs)
+
+                list(
+                    eta1 = rowMeans(X, dims = r),
+                    alphas = alphas,
+                    Omegas = Map(diag, p)
+                )
+            }
+
+            # parameters of the tensor normal computed from the GLM parameters
+            params <- function(Fy, eta1, alphas, Omegas) {
+                Deltas <- Map(solve, Omegas)
+                mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
+                list(mu_y = mu_y, Deltas = Deltas)
+            }
+
+            # scaled negative log-likelihood
+            log.likelihood <- function(X, Fy, eta1, alphas, Omegas) {
+                n <- tail(dim(X), 1)  # sample size
+                p <- head(dim(X), -1) # predictor dimensions
+
+                # conditional mean
+                mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas))
+
+                # negative log-likelihood scaled by sample size
+                # Note: the `suppressWarnings` is because `log(mapply(det, Omegas))`
+                # might fail, but `NAGD` has failsafes against cases of "illegal"
+                # parameters.
+                suppressWarnings(
+                    0.5 * prod(p) * log(2 * pi) +
+                        sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) -
+                        (0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p)
+                )
+            }
+
+            # gradient of the scaled negative log-likelihood
+            grad <- function(X, Fy, eta1, alphas, Omegas) {
+                # retrieve dimensions
+                n <- tail(dim(X), 1)  # sample size
+                p <- head(dim(X), -1) # predictor dimensions
+                r <- length(p)        # single predictor/response tensor order
+
+                ## "Inverse" Link: Tensor Normal Specific
+                # known exponential family constants
+                c1 <- 1
+                c2 <- -0.5
+
+                # Covariances from the GLM parameter Scatter matrices
+                Deltas <- Map(solve, Omegas)
+
+                # First moment via "inverse" link `g1(eta_y) = E[X | Y = y]`
+                E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
+
+                # Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]`
+                dim(E1) <- c(prod(p), n)
+                E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1))
+                ## end "Inverse" Link
+
+                dim(X) <- c(prod(p), n)
+
+                # Residuals
+                R <- X - E1
+                dim(R) <- c(p, n)
+
+                # mean deviation between the sample covariance to GLM estimated covariance
+                # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
+                S <- rowMeans(colKronecker(X, X)) - E2 # <- Optimized for Tensor Normal
+                dim(S) <- c(p, p) # reshape to tensor of order `2 r`
+
+                # Gradients of the negative log-likelihood scaled by sample size
+                list(
+                    "Dl(eta1)" = -c1 * rowMeans(R, dims = r),
+                    "Dl(alphas)" = Map(function(j) {
+                        (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
+                    }, 1:r),
+                    "Dl(Omegas)" = Map(function(j) {
+                        deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
+                        # adapt to the symmetry constraint for the derivative
+                        dim(deriv) <- c(p[j], p[j])
+                        deriv + t(deriv * (1 - diag(p[j])))
+                    }, 1:r)
+                )
+            }
+
+            # mean conditional Fisher Information
+            fisher.info <- function(Fy, eta1, alphas, Omegas) {
+                # retrieve dimensions
+                n <- tail(dim(Fy), 1)  # sample size
+                p <- dim(eta1)         # predictor dimensions
+                q <- head(dim(Fy), -1) # response dimensions
+                r <- length(p)         # single predictor/response tensor order
+
+                # Hij = Cov(ti(X), tj(X) | Y = y), i, j = 1, 2
+                H11 <- Reduce(`%x%`, rev(Map(solve, Omegas))) # covariance
+                # 3rd central moment is zero
+                H12 <- H21 <- 0
+                # 4th moment by "Isserlis' theorem" a.k.a. "Wick's theorem"
+                H22 <- kronecker(H11, H11)
+                dim(H22) <- rep(prod(p), 4)
+                # sum over the three distinct pair partitions
+                H22 <- H22 + aperm(H22, c(1, 3, 2, 4)) + aperm(H22, c(1, 2, 4, 3))
+
+                dim(H11) <- c(p, p)
+                dim(H22) <- c(p, p, p, p)
+
+                ## Fisher Information: Tensor Normal Specific
+                # known exponential family constants
+                c1 <- 1
+                c2 <- -0.5
+
+                # list of (Fy x_{k in [r]\j} alpha_k)
+                Gys <- Map(function(j) {
+                    mlm(Fy, alphas[-j], (1:r)[-j])
+                }, 1:r)
+
+                # setup indices of lower triangular matrix elements used for
+                # all tuples (j, l) with 1 <= j <= l <= r.
+                ltri <- lower.tri(diag(r), TRUE)
+                J <- .col(c(r, r))[ltri]
+                L <- .row(c(r, r))[ltri]
+
+                # inverse perfect outer shuffle of `2 r` elements
+                iShuf <- rep(1:r, each = 2) + c(0, r)
+
+                # Getter for the i'th observation from Gys[[j]]
+                # TODO: this is ugly, find a better (but still dynamic) version
+                GyGet <- function(i, j) {
+                    obs <- eval(str2lang(paste(
+                        "Gys[[j]][",
+                        paste(rep(",", r), collapse = ""),
+                        "i, drop = FALSE]",
+                        collapse = "")))
+                    dim(obs) <- head(dim(obs), -1)
+                    obs
+                }
+
+                I11 <- c1^2 * mat(H11, 1:r)
+                I12 <- Map(function(j) {
+                    i11 <- ttt(Gys[[j]], H11, (1:r)[-j])
+                    # take mean with respect to observations (second mode)
+                    i11 <- colMeans(mat(i11, 2))
+                    dim(i11) <- c(q[j], p[j], p)
+                    t(mat(i11, 2:1))
+                }, 1:r)
+                I13 <- Map(matrix, 0, prod(p), p^2)
+                I22 <- Map(function(j, l) {
+                    i22 <- 0
+                    for (i in 1:n) {
+                        i22 <- i22 + ttt(
+                            ttt(GyGet(i, j), H11, (1:r)[-j]),
+                            GyGet(i, l), (1:r)[-l] + 2, (1:r)[-l])
+                    }
+                    (c1^2 / n) * mat(i22, 2:1)
+                }, J, L)
+                I23 <- Map(matrix, 0, (p * q)[J], (p^2)[L])
+
+                # shuffled H22
+                sH22 <- c2^2 * aperm(H22, c(iShuf, iShuf + 2 * r))
+                dim(sH22) <- c(p^2, p^2)
+                I33 <- Map(function(j, l) {
+                    # second derivative (without constraints)
+                    deriv <- mlm(mlm(sH22, Map(c, Omegas[-j]), (1:r)[-j],
+                                     transposed = TRUE),
+                                 Map(c, Omegas[-l]), (1:r)[-l] + r,
+                                 transposed = TRUE)
+                    dim(deriv) <- c(p[j], p[j], p[l], p[l])
+
+                    # enforce symmetry of `Omega_j`
+                    of.diag <- (slice.index(deriv, 1:2) != slice.index(deriv, 2:1))
+                    deriv <- deriv + of.diag * aperm(deriv, c(2, 1, 3, 4))
+
+                    # as well as the symmetry of `Omega_l`
+                    of.diag <- (slice.index(deriv, 3:4) != slice.index(deriv, 4:3))
+                    deriv <- deriv + of.diag * aperm(deriv, c(1, 2, 4, 3))
+
+                    # matricize and return
+                    dim(deriv) <- c(p[j]^2, p[l]^2)
+                    deriv
+                }, J, L)
+
+                names(I12) <- sprintf("I(eta1, alpha_%d)", 1:r)
+                names(I13) <- sprintf("I(eta1, Omega_%d)", 1:r)
+                names(I22) <- sprintf("I(alpha_%d, alpha_%d)", J, L)
+                names(I23) <- sprintf("I(alpha_%d, Omega_%d)", J, L)
+                names(I33) <- sprintf("I(Omega_%d, Omega_%d)", J, L)
+
+                list(I11 = I11, I12 = I12, I13 = I13, I22 = I22, I23 = I23, I33 = I33)
+            }
+
+            # Hessian of the scaled negative log-likelihood
+            hessian <- function(X, Fy, eta1, alphas, Omegas) {
+                stop("Not Implemented")
+            }
+        },
+
+        ########################################################################
+        ###                       Multi-Variate Bernoulli                    ###
+        ########################################################################
+        bernoulli = {
+            require(mvbernoulli)
+
+            initialize <- function(X, Fy) {
+                p <- head(dim(X), -1)
+                r <- length(p)
+
+                # Mode-Covariances
+                XSigmas <- mcov(X, sample.axis = r + 1L)
+                YSigmas <- mcov(Fy, sample.axis = r + 1L)
+
+                # Extract main mode covariance directions
+                # Note: (the directions are transposed!)
+                XDirs <- Map(function(Sigma) {
+                    SVD <- La.svd(Sigma, nu = 0)
+                    sqrt(SVD$d) * SVD$vt
+                }, XSigmas)
+                YDirs <- Map(function(Sigma) {
+                    SVD <- La.svd(Sigma, nu = 0)
+                    sqrt(SVD$d) * SVD$vt
+                }, YSigmas)
+
+                alphas <- Map(function(xdir, ydir) {
+                    s <- min(ncol(xdir), nrow(ydir))
+                    crossprod(xdir[seq_len(s), , drop = FALSE],
+                              ydir[seq_len(s), , drop = FALSE])
+                }, XDirs, YDirs)
+
+                list(
+                    eta1 = array(0, dim = p),
+                    alphas = alphas,
+                    Omegas = Map(diag, p)
+                )
+            }
+
+            params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
+                # number of observations
+                n <- tail(dim(Fy), 1)
+
+                # natural exponential family parameters
+                eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1))
+                eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas))
+
+                # # next the conditional Ising model parameters `theta_y`
+                # theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n)
+                # dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n)
+
+                # ltri <- which(lower.tri(eta_y2, diag = TRUE))
+                # diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
+                # theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1)
+                # theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ]
+
+                # conditional Ising model parameters
+                theta_y <- matrix(rep(vech(eta_y2), n), ncol = n)
+                ltri <- which(lower.tri(eta_y2, diag = TRUE))
+                diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
+                theta_y[diagonal, ] <- eta_y1
+
+                theta_y
+            }
+
+            # Scaled negative log-likelihood
+            log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
+                # number of observations
+                n <- tail(dim(X), 1L)
+
+                # conditional Ising model parameters
+                theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
+
+                # convert to binary data set
+                storage.mode(X) <- "integer"
+                X.mvb <- as.mvbinary(mat(X, length(dim(X))))
+
+                # log-likelihood of the data set
+                -mean(sapply(seq_len(n), function(i) {
+                    ising_log_likelihood(theta_y[, i], X.mvb[i])
+                }))
+            }
+
+            # Gradient of the scaled negative log-likelihood
+            grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
+                # retrieve dimensions
+                n <- tail(dim(X), 1)  # sample size
+                p <- head(dim(X), -1) # predictor dimensions
+                r <- length(p)        # single predictor/response tensor order
+
+                ## "Inverse" Link: Ising Model Specific
+                # conditional Ising model parameters: `p (p + 1) / 2` by `n`
+                theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
+
+                # conditional expectations
+                # ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y]
+                E2 <- apply(theta_y, 2L, ising_marginal_probs)
+                # convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y]
+                E2 <- E2[vech.pinv.index(prod(p)), ]
+                # extract diagonal elements which are equal to E[vec(X) | Y = y]
+                E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ]
+                ## end "Inverse" Link
+
+                dim(X) <- c(prod(p), n)
+
+                # Residuals
+                R <- X - E1
+                dim(R) <- c(p, n)
+
+                # mean deviation between the sample covariance to GLM estimated covariance
+                # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
+                S <- rowMeans(colKronecker(X, X) - E2)
+                dim(S) <- c(p, p) # reshape to tensor of order `2 r`
+
+                # Gradients of the negative log-likelihood scaled by sample size
+                list(
+                    "Dl(eta1)" = -c1 * rowMeans(R, dims = r),
+                    "Dl(alphas)" = Map(function(j) {
+                        (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
+                    }, 1:r),
+                    "Dl(Omegas)" = Map(function(j) {
+                        deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
+                        # adapt to the symmetry constraint for the derivative
+                        dim(deriv) <- c(p[j], p[j])
+                        deriv + t(deriv * (1 - diag(p[j])))
+                    }, 1:r)
+                )
+            }
+
+            # Hessian of the scaled negative log-likelihood
+            hessian <- function(X, Fy, alphas, Omegas, c1 = 1, c2 = 1) {
+                stop("Not Implemented")
+            }
+
+            # Conditional Fisher Information
+            fisher.info <- function(Fy, alphas, Omegas) {
+                stop("Not Implemented")
+            }
+        }
+    )
+
+    list(
+        family = name,
+        initialize = initialize,
+        params = params,
+        # linkinv = linkinv,
+        log.likelihood = log.likelihood,
+        grad = grad,
+        hessian = hessian,
+        fisher.info = fisher.info
+    )
+}
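For orientation, a sketch of how the returned family object is meant to be consumed (the initialize/evaluate pattern follows the test files in this commit; the toy data and dimensions here are made up):

library(tensorPredictors)

# toy order-2 tensor predictors, sample in the last axis
n  <- 100L
X  <- array(rnorm(n * 4 * 3), dim = c(4L, 3L, n))
Fy <- array(rnorm(n * 2 * 2), dim = c(2L, 2L, n))

family <- make.gmlm.family("normal")
theta  <- family$initialize(X, Fy)   # list(eta1, alphas, Omegas)

# scaled negative log-likelihood and gradient at the initial values
with(theta, family$log.likelihood(X, Fy, eta1, alphas, Omegas))
with(theta, names(family$grad(X, Fy, eta1, alphas, Omegas)))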
@@ -15,35 +15,43 @@
 #'
 #' @export
 matrixImage <- function(A, add.values = FALSE,
-    main = NULL, sub = NULL, interpolate = FALSE, ...,
+    main = NULL, sub = NULL, interpolate = FALSE, ..., zlim = NA,
+    axes = TRUE, asp = 1, col = hcl.colors(24, "YlOrRd", rev = FALSE),
     digits = getOption("digits")
 ) {
     # plot raster image
-    plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "red",
-        xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main, sub = sub)
+    plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "black",
+        xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main, sub = sub,
+        asp = asp)

     # Scale values of `A` to [0, 1] with min mapped to 1 and max to 0.
-    S <- (max(A) - A) / diff(range(A))
+    if (missing(zlim)) {
+        S <- (max(A) - A) / diff(range(A))
+    } else {
+        S <- pmin(pmax(0, (zlim[2] - A) / diff(zlim)), 1)
+    }
+    # and now transform to color
+    S <- matrix(col[round((length(col) - 1) * S + 1)], nrow = nrow(A))

     # Raster image plotting the matrix with element values mapped to the color
     # scale, as big elements (original matrix A) are dark and small (negative)
     # elements are light.
     rasterImage(S, 0, 0, ncol(A), nrow(A), interpolate = interpolate, ...)

-    # Add X-axis giving index
+    # X/Y axes index (matches coordinates to matrix indices)
     x <- seq(1, ncol(A), by = 1)
-    axis(1, at = x - 0.5, labels = x, lwd = 0, lwd.ticks = 1)
-
-    # Add Y-axis
     y <- seq(1, nrow(A))
-    axis(2, at = y - 0.5, labels = rev(y), lwd = 0, lwd.ticks = 1, las = 1)
+    if (axes) {
+        axis(1, at = x - 0.5, labels = x, lwd = 0, lwd.ticks = 1)
+        axis(2, at = y - 0.5, labels = rev(y), lwd = 0, lwd.ticks = 1, las = 1)
+    }

-    # Writes matrix values (in colored element grids)
+    # Writes matrix values
     if (any(add.values)) {
         if (length(add.values) > 1) {
             A[!add.values] <- NA
             A[add.values] <- format(A[add.values], digits = digits)
         }
-        text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A,
-            adj = 0.5, col = as.integer(S > 0.65))
+        text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A, adj = 0.5)
     }
 }
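A small usage sketch of the extended interface (`zlim`, `axes`, `asp`, and `col` are exactly the arguments added above); `zlim` puts two matrices on a common color scale:

A <- matrix(rnorm(3 * 4), 3, 4)
B <- matrix(rnorm(3 * 4), 3, 4)
par(mfrow = c(1, 2))
matrixImage(A, add.values = TRUE, main = "A", zlim = c(-3, 3))
matrixImage(B, add.values = TRUE, main = "B", zlim = c(-3, 3), axes = FALSE)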
@@ -3,34 +3,101 @@
 #' @example inst/examples/num_deriv.R
 #'
 #' @export
-num.deriv <- function(F, X, h = 1e-6, sym = FALSE) {
+num.deriv <- function(expr, ..., h = 1e-6, sym = FALSE) {
+    sexpr <- substitute(expr)
+    if (...length() != 1) {
+        stop("Expected exactly one '...' variable")
+    }
+    var <- ...names()[1]
+    if (is.null(var)) {
+        arg <- substitute(...)
+        var <- if (is.symbol(arg)) as.character(arg) else "x"
+    }
+
+    if (is.language(sexpr) && !is.symbol(sexpr) && sexpr[[1]] == as.symbol("function")) {
+        func <- expr
+    } else {
+        if (is.name(sexpr)) {
+            expr <- call(as.character(sexpr), as.name(var))
+        } else {
+            if ((!is.call(sexpr) && !is.expression(sexpr))
+                || !(var %in% all.vars(sexpr))) {
+                stop("'expr' must be a function or expression containing '", var, "'")
+            }
+            expr <- sexpr
+        }
+
+        args <- as.pairlist(structure(list(alist(x = )[[1]]), names = var))
+        func <- as.function(c(args, expr), envir = parent.frame())
+    }
+
+    num.deriv.function(func, ..1, h = h, sym = sym)
+}
+
+#' @rdname num.deriv
+#' @export
+num.deriv.function <- function(FUN, X, h = 1e-6, sym = FALSE) {
     if (sym) {
         stopifnot(isSymmetric(X))
         p <- nrow(X)
         k <- seq_along(X) - 1
         mapply(function(i, j) {
             dx <- h * ((k == i * p + j) | (k == j * p + i))
-            (F(X + dx) - F(X - dx)) / (2 * h)
+            (FUN(X + dx) - FUN(X - dx)) / (2 * h)
         }, .row(dim(X)) - 1, .col(dim(X)) - 1)
     } else {
         sapply(seq_along(X), function(i) {
             dx <- h * (seq_along(X) == i)
-            (F(X + dx) - F(X - dx)) / (2 * h)
+            (FUN(X + dx) - FUN(X - dx)) / (2 * h)
         })
     }
 }

-#' Second numeric derivative
-#'
+#' @rdname num.deriv
 #' @export
-num.deriv2 <- function(F, X, Y, h = 1e-6, symX = FALSE, symY = FALSE) {
+num.deriv2 <- function(FUN, X, Y, h = 1e-6, symX = FALSE, symY = FALSE) {
     if (missing(Y)) {
-        num.deriv(function(x) {
-            num.deriv(function(z) F(z), x, h = h, sym = symX)
+        num.deriv.function(function(x) {
+            num.deriv.function(FUN, x, h = h, sym = symX)
         }, X, h = h, sym = symX)
     } else {
-        num.deriv(function(y) {
-            num.deriv(function(x) F(x, y), X, h = h, sym = symX)
+        num.deriv.function(function(y) {
+            num.deriv.function(function(x) FUN(x, y), X, h = h, sym = symX)
         }, Y, h = h, sym = symY)
     }
 }
+
+
+### Interface Idea / Demo
+# num.deriv2.function
+# num.deriv2 <- function(expr, var) {
+#     sexpr <- substitute(expr)
+#     svar <- substitute(var)
+#
+#     if (is.language(sexpr) && !is.symbol(sexpr) && sexpr[[1]] == as.symbol("function")) {
+#         func <- expr
+#     } else {
+#         if (is.name(sexpr)) {
+#             expr <- call(as.character(sexpr), as.name(svar))
+#         } else {
+#             if ((!is.call(sexpr) && !is.expression(sexpr))
+#                 || !(as.character(svar) %in% all.vars(sexpr))) {
+#                 stop("'expr' must be a function or expression containing '",
+#                      as.character(svar), "'")
+#             }
+#             expr <- sexpr
+#         }
+#
+#         args <- as.pairlist(structure(list(alist(x = )[[1]]), names = as.character(svar)))
+#         func <- as.function(c(args, expr), envir = parent.frame())
+#     }
+#
+#     num.deriv2.function(func, var)
+# }
+# y <- c(pi, exp(1), (sqrt(5) + 1) / 2)
+# num.deriv2(function(x) x^2 + y, x)(1:3)
+# num.deriv2(x^2 + y, x)(1:3)
+# func <- function(x) x^2 + y
+# num.deriv2(func, x)(1:3)
+# func2 <- function(z = y, x) x^2 + z
+# num.deriv2(func2, x)(1:3)
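The extended `num.deriv` accepts a function, a function name, or a bare expression in the single `...` variable; all forms dispatch to the same central-difference rule in `num.deriv.function`. A quick scalar sketch (the three calls are equivalent by construction):

x0 <- pi / 4
stopifnot(exprs = {
    all.equal(num.deriv(sin, x = x0), cos(x0))            # function name
    all.equal(num.deriv(sin(x), x = x0), cos(x0))         # bare expression
    all.equal(num.deriv(function(x) sin(x), x0), cos(x0)) # function literal
})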
@@ -4,15 +4,17 @@ A <- A + t(A)
 B <- matrix(rnorm(3 * 4), 3, 4)

 # F(A) = A B
-# DF(A) = B' x I
+# DF(A) = B' %x% I
-stopifnot(all.equal(
-    num.deriv(function(X) X %*% B, A),
+exprs.all.equal({
     t(B) %x% diag(nrow(A))
-))
+    num.deriv(function(X) X %*% B, A)
+    num.deriv(A %*% B, A)
+    num.deriv(X %*% B, X = A)
+})

 # Symmetric case, constraint A = A' (equiv to being a function of vech(A) only)
 # F(A) = A B for A = A'
-# DF(A) = B' x I
+# DF(A) = B' %x% I
 stopifnot(all.equal(
     num.deriv(function(X) X %*% B, A, sym = TRUE),
     (t(B) %x% diag(nrow(A))) %*% D(nrow(A)) %*% t(D(nrow(A)))
@@ -0,0 +1,4 @@
+library(testthat)
+library(tensorPredictors)
+
+test_check("tensorPredictors")
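With this standard testthat scaffold in place, the suite runs automatically during `R CMD check`; interactively, something like the following works (assuming the test files below live under `tests/testthat/`):

# run the whole suite from the package root
testthat::test_dir("tests/testthat")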
@@ -0,0 +1,50 @@
+# F(X) = log(det(X)) if det(X) > 0
+# dF(X) = tr(X^-1 dX)
+# DF(X) = vec((X^-1)')'
+test_that("Matrix Calculus 1", {
+    # test data
+    X <- tcrossprod(diag(5) + matrix(runif(5^2, -.1, .1), 5))
+
+    num.grad <- num.deriv(function(X) log(det(X)), X)
+    ana.grad <- c(solve(X))
+
+    expect_equal(num.grad, ana.grad)
+})
+
+# F(mu) = <X - mu, (X - mu) x_{k in [r]} Delta_k^-1> for Delta_k = Delta_k'
+# DF(mu) = -2 vec((X - mu) x_{k in [r]} Delta_k^-1)'
+test_that("Matrix Calculus 2", {
+    p <- c(2, 3, 5)
+
+    X <- array(rnorm(prod(p)), dim = p)
+    mu <- array(rnorm(prod(p)), dim = p)
+    Deltas <- Map(function(pj) tcrossprod(diag(pj) + runif(pj^2, -0.1, 0.1)), p)
+
+    ana.grad <- -2 * c(mlm(X - mu, Map(solve, Deltas)))
+    num.grad <- num.deriv(function(mu) sum((X - mu) * mlm(X - mu, Map(solve, Deltas))), mu)
+
+    expect_equal(num.grad, ana.grad)
+})
+
+# F(Delta_j) = <X - mu, (X - mu) x_{k in [r]} Delta_k^-1> for Delta_k = Delta_k'
+# DF(Delta_j) = -((X - mu) x_{k in [r]} Delta_k^-1)_(j) (X - mu)_(j)' Delta_j^-1
+test_that("Matrix Calculus 3", {
+    # config
+    p <- c(2, 3, 5)
+
+    # generate test data
+    X <- array(rnorm(prod(p)), dim = p)
+    mu <- array(rnorm(prod(p)), dim = p)
+    Deltas <- Map(function(pj) tcrossprod(diag(pj) + runif(pj^2, -0.1, 0.1)), p)
+
+    # check analytic to numeric derivatives
+    for (j in seq_along(Deltas)) {
+        num.grad <- matrix(num.deriv(function(Delta_j) {
+            Deltas[[j]] <- Delta_j
+            sum((X - mu) * mlm(X - mu, Map(solve, Deltas)))
+        }, Deltas[[j]]), p[j])
+        ana.grad <- -mcrossprod(mlm(X - mu, Map(solve, Deltas)), X - mu, j) %*% solve(Deltas[[j]])
+
+        expect_equal(num.grad, ana.grad)
+    }
+})
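For reference, the standard matrix-calculus identity behind the first test, written with the macros of GMLM.tex:
\begin{displaymath}
    \d\log\det{\mat{X}} = \operatorname{tr}\bigl(\mat{X}^{-1}\d\mat{X}\bigr)
    \qquad\Rightarrow\qquad
    \D\log\det{\mat{X}} = \t{(\vec{\t{(\mat{X}^{-1})}})},
\end{displaymath}
which for the symmetric positive definite test matrix reduces to $\t{(\vec{\mat{X}^{-1}})}$, matching `c(solve(X))` above.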
@@ -0,0 +1,186 @@
+test_that("Tensor-Normal Score", {
+    # setup dimensions
+    n <- 11L
+    p <- c(3L, 5L, 4L)
+    q <- c(2L, 7L, 3L)
+
+    # create "true" GLM parameters
+    eta1 <- array(rnorm(prod(p)), dim = p)
+    alphas <- Map(matrix, Map(rnorm, p * q), p)
+    Omegas <- Map(function(pj) {
+        solve(0.5^abs(outer(seq_len(pj), seq_len(pj), `-`)))
+    }, p)
+    params <- list(eta1, alphas, Omegas)
+
+    # compute tensor normal parameters from GLM parameters
+    Deltas <- Map(solve, Omegas)
+    mu <- mlm(eta1, Deltas)
+
+    # sample some test data
+    sample.axis <- length(p) + 1L
+    Fy <- array(rnorm(n * prod(q)), dim = c(q, n))
+    X <- mlm(Fy, Map(`%*%`, Deltas, alphas)) + rtensornorm(n, mu, Deltas, sample.axis)
+
+    # Create a GLM family object
+    family <- make.gmlm.family("normal")
+
+    # first unpack the family object
+    log.likelihood <- family$log.likelihood
+    grad <- family$grad
+
+    # compare numeric gradient against the analytic gradient provided by the
+    # family at the initial parameters
+    grad.num <- do.call(function(eta1, alphas, Omegas) {
+        list(
+            "Dl(eta1)" = num.deriv.function(function(eta1) {
+                log.likelihood(X, Fy, eta1, alphas, Omegas)
+            }, eta1),
+            "Dl(alphas)" = Map(function(j) num.deriv.function(function(alpha_j) {
+                alphas[[j]] <- alpha_j
+                log.likelihood(X, Fy, eta1, alphas, Omegas)
+            }, alphas[[j]]), seq_along(alphas)),
+            "Dl(Omegas)" = Map(function(j) num.deriv.function(function(Omega_j) {
+                Omegas[[j]] <- Omega_j
+                log.likelihood(X, Fy, eta1, alphas, Omegas)
+            }, Omegas[[j]], sym = TRUE), seq_along(Omegas))
+        )
+    }, params)
+    grad.ana <- do.call(grad, c(list(X, Fy), params))
+
+    expect_equal(RMap(c, grad.ana), grad.num, tolerance = 1e-6)
+
+    # test for correct dimensions
+    ana.dims <- list(
+        "Dl(eta1)" = p,
+        "Dl(alphas)" = Map(c, p, q),
+        "Dl(Omegas)" = Map(c, p, p)
+    )
+
+    expect_identical(RMap(dim, grad.ana), ana.dims)
+})
+
+test_that("Tensor-Normal moments of the sufficient statistic t(X)", {
+    # config
+    # # (estimated) sample size for sample estimates
+    # n <- 250000 # sample estimates are too unreliable for testing purposes
+    p <- c(2L, 3L)
+    r <- length(p)
+
+    # setup tensor normal parameters (NO dependence on Fy, set to zero)
+    mu <- array(runif(prod(p), min = -0.5, max = 0.5), dim = p) # = mu_y
+    Deltas <- Map(function(pj) 0.5^abs(outer(1:pj, 1:pj, `-`)), p)
+
+    # compute GMLM parameters (NO alphas as Fy is zero)
+    Omegas <- Map(solve, Deltas)
+    eta1 <- mlm(mu, Omegas)
+
+    # natural parameters of the tensor normal
+    eta_y1 <- eta1 # + mlm(Fy, alphas) = + 0
+    eta_y2 <- -0.5 * Reduce(`%x%`, rev(Omegas))
+
+    # # draw a sample (only needed for the commented-out sample estimates below)
+    # X <- rtensornorm(n, mu, Deltas, r + 1L)
+
+    # tensor normal log-partition function given eta_y
+    log.partition <- function(eta_y1, eta_y2) {
+        # eta_y1 as "model" matrix of dimensions `n by p` where `n` might be `1`
+        if (length(eta_y1)^2 == length(eta_y2)) {
+            pp <- length(eta_y1)
+            dim(eta_y1) <- c(1L, pp)
+        } else {
+            eta_y1 <- mat(eta_y1, r + 1L)
+            pp <- ncol(eta_y1)
+        }
+
+        # treat eta_y2 as square matrix
+        if (!is.matrix(eta_y2)) {
+            dim(eta_y2) <- c(pp, pp)
+        }
+
+        # evaluate log-partition function in terms of natural parameters
+        -0.25 * pp * mean((eta_y1 %*% solve(eta_y2)) * eta_y1) - 0.5 * log(det(-2 * eta_y2))
+    }
+
+    # (analytic) first and second (un-centered) moment of the tensor normal
+    # Eti = E[ti(X) | Y = y]
+    Et1.ana <- mu
+    Et2.ana <- Reduce(`%x%`, rev(Deltas)) + outer(c(mu), c(mu))
+
+    # (numeric) derivative of the log-partition function with respect to the natural
+    # parameters of the exponential family form of the tensor normal
+    Et1.num <- num.deriv(log.partition(eta_y1, eta_y2), eta_y1)
+    Et2.num <- num.deriv(log.partition(eta_y1, eta_y2), eta_y2)
+
+    # # (estimated) estimate from sample
+    # Et1.est <- rowMeans(X, dims = r)
+    # Et2.est <- colMeans(rowKronecker(mat(X, r + 1L), mat(X, r + 1L)))
+
+    # (analytic) covariance blocks of the sufficient statistic
+    # Ctij = Cov(ti(X), tj(X) | Y = y)
+    Ct11.ana <- Reduce(`%x%`, rev(Deltas))
+    Ct12.ana <- local({
+        # Analytic solution / `vec(mu %x% Delta) + vec(mu' %x% Delta)`
+        ct12 <- 2 * c(mu) %x% Reduce(`%x%`, rev(Deltas))
+        # and symmetrize!
+        dim(ct12) <- rep(prod(p), 3)
+        (1 / 2) * (ct12 + aperm(ct12, c(3, 1, 2)))
+    })
+    Ct22.ana <- local({
+        Delta <- Reduce(`%x%`, rev(Deltas)) # Ct11.ana
+        ct22 <- (2 * Delta + 4 * outer(c(mu), c(mu))) %x% Delta
+
+        # TODO: What does this symmetrization do exactly? And why?!
+        dim(ct22) <- rep(prod(p), 4)
+        (1 / 4) * (
+            aperm(ct22, c(2, 3, 1, 4)) +
+            aperm(ct22, c(2, 3, 4, 1)) +
+            aperm(ct22, c(3, 2, 1, 4)) +
+            aperm(ct22, c(3, 2, 4, 1))
+        )
+    })
+
+    # (numeric) second derivative of the log-partition function
+    Ct11.num <- num.deriv2(function(eta_y1) log.partition(eta_y1, eta_y2), eta_y1)
+    Ct12.num <- num.deriv2(log.partition, eta_y1, eta_y2)
+    Ct22.num <- local({
+        ct22 <- num.deriv2(function(eta_y2) log.partition(eta_y1, eta_y2), eta_y2)
+        # TODO: Why does this need to be symmetrized?!
+        dim(ct22) <- rep(prod(p), 4)
+        (1 / 4) * (
+            aperm(ct22, c(3, 4, 2, 1)) +
+            aperm(ct22, c(4, 3, 2, 1)) +
+            aperm(ct22, c(3, 4, 1, 2)) +
+            aperm(ct22, c(4, 3, 1, 2))
+        )
+    })
+
+    # # (estimated) sample estimates of the sufficient statistic covariance blocks
+    # T1 <- mat(X, r + 1L)
+    # T2 <- rowKronecker(T1, T1)
+    # Ct11.est <- cov(T1)
+    # Ct12.est <- cov(T1, T2)
+    # Ct22.est <- cov(T2, T2)
+
+    tolerance <- 1e-3
+    expect_equal(c(Et1.ana), c(Et1.num), tolerance = tolerance)
+    # expect_equal(c(Et1.ana), c(Et1.est), tolerance = 0.05, scale = 1)
+    # expect_equal(c(Et1.num), c(Et1.est), tolerance = 0.05, scale = 1)
+
+    expect_equal(c(Et2.ana), c(Et2.num), tolerance = tolerance)
+    # expect_equal(c(Et2.ana), c(Et2.est), tolerance = 0.05, scale = 1)
+    # expect_equal(c(Et2.num), c(Et2.est), tolerance = 0.05, scale = 1)
+
+    expect_equal(c(Ct11.ana), c(Ct11.num), tolerance = tolerance)
+    # expect_equal(c(Ct11.ana), c(Ct11.est), tolerance = 0.05, scale = 1)
+    # expect_equal(c(Ct11.num), c(Ct11.est), tolerance = 0.05, scale = 1)
+
+    expect_equal(c(Ct12.ana), c(Ct12.num), tolerance = tolerance)
+    # expect_equal(c(Ct12.ana), c(Ct12.est), tolerance = 0.05, scale = 1)
+    # expect_equal(c(Ct12.num), c(Ct12.est), tolerance = 0.05, scale = 1)
+
+    expect_equal(c(Ct22.ana), c(Ct22.num), tolerance = tolerance)
+    # expect_equal(c(Ct22.ana), c(Ct22.est), tolerance = 0.05, scale = 1)
+    # expect_equal(c(Ct22.num), c(Ct22.est), tolerance = 0.05, scale = 1)
+})
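Regarding the symmetrization TODOs above, one plausible reading: the `aperm` averages project onto the index symmetries that the true moments satisfy. By Isserlis' (Wick's) theorem, for a centered Gaussian $\vec{\ten{X}}$ with covariance $\mat{\Delta}$,
\begin{displaymath}
    \E[x_a x_b x_c x_d] = \Delta_{ab}\Delta_{cd} + \Delta_{ac}\Delta_{bd} + \Delta_{ad}\Delta_{bc},
\end{displaymath}
which is invariant under $a\leftrightarrow b$, $c\leftrightarrow d$, and $(a,b)\leftrightarrow(c,d)$. The raw numeric second derivative of the log-partition function treats $\mat{\eta}_{y2}$ as an unconstrained matrix and so need not satisfy these symmetries exactly; averaging over the permutations restores them.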