fix: some typos in GMLM.tex,

add: extended interface of num.deriv and matrixImage,
add: "tests",
add: moved make.gmlm.family into separate file
This commit is contained in:
Daniel Kapla 2022-10-31 15:14:58 +01:00
parent 79f16c58dc
commit 2dfdc7083b
11 changed files with 746 additions and 298 deletions

View File

@ -139,11 +139,12 @@
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We propose a method for sufficient dimension reduction of Tensor-valued predictor (multi dimensional arrays) for regression or classification. We assume a Quadratic Exponential Family and model the natural parameter for a Generalized Linear Model in an inverse regression setting where the relation via a link is of multi-linear nature.
Using a multi-linear relation allows to perform per-axis reductions which reduces the total number of parameters drastically for higher order Tensor-valued predictors. Under the Exponential Family we derive maximum likelihood estimates for the multi-linear sufficient dimension reduction of the Tensor-valued predictors. Furthermore, we provide an estimation algorithm which utilizes the Tensor structure allowing efficient implementations. The performance of the method is illustrated via simulations and real world examples are provided.
We consider regression and classification for \textit{general} response and tensor-valued predictors (multi dimensional arrays) and propose a \textit{novel formulation} for sufficient dimension reduction. Assuming the distribution of the tensor-valued predictors given the response is in the quadratic exponential family, we model the natural parameter as a multi-linear function of the response.
This allows per-axis reductions that drastically reduce the total number of parameters for higher order tensor-valued predictors. We derive maximum likelihood estimates for the sufficient dimension reduction and a computationally efficient estimation algorithm which leverages the tensor structure. The performance of the method is illustrated via simulations, and real world examples are provided.
\end{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -523,13 +524,13 @@ The next step is to identify the Hessians from the second differentials in a sim
&\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \vec\biggl( (\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j) \Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))} \biggr) \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big)_{(j)}\otimes\mat{I}_{p_j} \biggr) \mat{K}_{\mat{p},(j)}\mat{H}_{1,1}\t{\mat{K}_{\mat{p},(l)}} \biggl( \t{\Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big)_{(l)}}\otimes\mat{I}_{p_l} \biggr)\vec{\d\mat{\alpha}_l} \\
&\qquad + c_1 (\t{(\vec{\ten{X}})} - \D b(\mat{\eta}_{y,1})) \t{\mat{K}_{\mat{p},((j,l))}} \biggl( \t{\Big( \ten{F}_y\mlm{k\in[r]\backslash\{j,l\}}\mat{\alpha}_k \Big)_{((j,l))}}\otimes\mat{I}_{p_j p_l} \biggr) \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \vec{\d\mat{\alpha}_l} \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \vec{\d\mat{\alpha}_l} \\
&\qquad + c_1 \vec\biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr) \vec{(\d\mat{\alpha}_l\otimes\d\mat{\alpha}_j)} \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \vec{\d\mat{\alpha}_l} \\
&= -c_1^2 \t{(\vec{\d\mat{\alpha}_j})} \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \vec{\d\mat{\alpha}_l} \\
&\qquad + c_1 \t{(\vec{\d\mat{\alpha}_j})} \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3))} \vec{\d\mat{\alpha}_l} \\
&\qquad \begin{aligned}
\Rightarrow \frac{\partial l}{\partial(\vec{\mat{\alpha}_j})\t{\partial(\vec{\mat{\alpha}_l})}} &=
-c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{(2,1)} \\
-c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \\
&\qquad + c_1 \biggl( (\ten{X} - \ten{D}_1) \ttt_{[r]\backslash\{j,l\}} \Big( \ten{F}_y\mlm{k\neq j,l}\mat{\alpha}_k \Big) \biggr)_{((1,3) + [[j > l]])}
\qquad{\color{gray} (p_j q_j \times p_l q_l)}
\end{aligned}
@ -591,22 +592,23 @@ and with that all auxiliary calculation are done. Using the two expectations yie
\mathcal{I}_{2 r + 1,1} & \mathcal{I}_{2 r + 1,2} & \cdots & \mathcal{I}_{2 r + 1, 2 r + 1}
\end{pmatrix}
\end{displaymath}
for $j, l = 1, ..., r$ as follows
and for every block holds $\mathcal{I}_{j, l} = \t{\mathcal{I}_{l, j}}$. The individual blocks are given with $j = 1, ..., r$ and $j \leq l \leq r$ by
\begin{align*}
\mathcal{I}_{1,1} &= c_1^2 (\ten{H}_{1,1})_{([r])} \\
\mathcal{I}_{1,j+1} = \t{\mathcal{I}_{j+1,1}}
\mathcal{I}_{1,j+1} % = \E\partial_{\vec{\overline{\ten{\eta}}_1}}\partial_{\t{(\vec{\mat{\alpha}_j})}} l(\mat{\Theta})\mid \ten{Y} = y
&= c_1^2 \Big[\Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1}\Big]_{((2, 1))} \\
\mathcal{I}_{1,j+1} = \t{\mathcal{I}_{j+1,1}}
\mathcal{I}_{1,j+r+1}
&= c_1 c_2 \Big( \ten{R}_{[2r]}(\ten{H}_{2,1}) \mlm{k\in[r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \Big)_{(j)} \\
\mathcal{I}_{j+1,l+1} = \t{\mathcal{I}_{l+1,j+1}}
\mathcal{I}_{j+1,l+1}
&= c_1^2 \biggl( \Big[ \Big(\ten{F}_y\mlm{k\in[r]\backslash j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j} \ten{H}_{1,1} \Big] \ttt_{[r]\backslash l + 2}^{[r]\backslash l} \Big(\ten{F}_y\mlm{k\in[r]\backslash l}\mat{\alpha}_k\Big) \biggr)_{((2,1))} \\
\mathcal{I}_{j+1,l+r+1} = \t{\mathcal{I}_{l+r+1,j+1}}
\mathcal{I}_{j+1,l+r+1}
&= c_1 c_2 \biggl( \Big(\ten{F}_y\mlm{k\neq j}\mat{\alpha}_k\Big) \ttt_{[r]\backslash j}^{[r]\backslash j + r} \Bigl(\ten{R}_{[2r]}(\ten{H}_{2,1})\mlm{k\neq l}\t{(\vec{\mat{\Omega}_k})}\Bigr) \biggr)_{((r + 2, 1))} \\
\mathcal{I}_{j+r+1,l+r+1} = \t{\mathcal{I}_{l+r+1,j+r+1}}
\mathcal{I}_{j+r+1,l+r+1}
&= c_2^2 \Big(\ten{R}_{([2r],[2r]+2r)}(\ten{H}_{2,2}) \mlm{k\in [r]\backslash j}\t{(\vec{\mat{\Omega}_k})} \mlm{\substack{k + r \\ k\in [r]\backslash l}}\t{(\vec{\mat{\Omega}_k})}\Big)_{(j)}
\end{align*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Vectorization and Matricization}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -615,7 +617,7 @@ The \emph{matricization} is a generalization of the \emph{vectorization} operati
\begin{theorem}\label{thm:mlm_mat}
Let $\ten{A}$ be a tensor of order $r$ with the dimensions $q_1\times ... \times q_r$. Furthermore, let for $k = 1, ..., r$ be $\mat{B}_k$ matrices of dimensions $p_k\times q_k$. Then, for any $(\mat{i}, \mat{j})\in\perm{r}$ holds
\begin{displaymath}
(\ten{A}\times_{k\in[r]}\mat{B}_k)_{(\mat{i}, \mat{j})}
\Big(\ten{A}\mlm{k\in[r]}\mat{B}_k\Big)_{(\mat{i}, \mat{j})}
= \Big(\bigotimes_{k = \len{\mat{i}}}^{1}\mat{B}_{\mat{i}_k}\Big) \ten{A}_{(\mat{i}, \mat{j})} \Big(\bigotimes_{k = \len{\mat{j}}}^{1}\t{\mat{B}_{\mat{j}_k}}\Big).
\end{displaymath}
\end{theorem}
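For example, in the matrix case $r = 2$ the multi-linear multiplication reduces to $\ten{A}\mlm{k\in[2]}\mat{B}_k = \mat{B}_1\mat{A}\t{\mat{B}_2}$, and the choice $(\mat{i}, \mat{j}) = ((1, 2), ())$ specializes the theorem to the well-known identity
\begin{displaymath}
\vec{(\mat{B}_1\mat{A}\t{\mat{B}_2})} = (\mat{B}_2\otimes\mat{B}_1)\vec{\mat{A}}.
\end{displaymath}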

View File

@ -47,6 +47,7 @@ export(mkm)
export(mlm)
export(mtvk)
export(num.deriv)
export(num.deriv.function)
export(num.deriv2)
export(reduce)
export(rowKronecker)

View File

@ -5,265 +5,6 @@ GMLM <- function(...) {
stop("Not Implemented")
}
make.gmlm.family <- function(name) {
# standardize family name
name <- list(
normal = "normal", gaussian = "normal",
bernoulli = "bernoulli", ising = "bernoulli"
)[[tolower(name), exact = FALSE]]
############################################################################
# #
# TODO: better (and possibly specialized) initial parameters!?!?!?! #
# #
############################################################################
switch(name,
normal = {
initialize <- function(X, Fy) {
p <- head(dim(X), -1)
r <- length(p)
# Mode-Covariances
XSigmas <- mcov(X, sample.axis = r + 1L)
YSigmas <- mcov(Fy, sample.axis = r + 1L)
# Extract main mode covariance directions
# Note: (the directions are transposed!)
XDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, XSigmas)
YDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, YSigmas)
alphas <- Map(function(xdir, ydir) {
s <- min(ncol(xdir), nrow(ydir))
crossprod(xdir[seq_len(s), , drop = FALSE],
ydir[seq_len(s), , drop = FALSE])
}, XDirs, YDirs)
list(
eta1 = rowMeans(X, dims = r),
alphas = alphas,
Omegas = Map(diag, p)
)
}
# parameters of the tensor normal computed from the GLM parameters
params <- function(Fy, eta1, alphas, Omegas) {
Deltas <- Map(solve, Omegas)
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
list(mu_y = mu_y, Deltas = Deltas)
}
# scaled negative log-likelihood
log.likelihood <- function(X, Fy, eta1, alphas, Omegas) {
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions (needed below for `prod(p)`)
# conditional mean
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas))
# negative log-likelihood scaled by sample size
# Note: the `suppressWarnings` is needed because `log(mapply(det, Omegas))`
# might fail, but `NAGD` has failsafes against cases of "illegal"
# parameters.
suppressWarnings(
0.5 * prod(p) * log(2 * pi) +
sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) -
(0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p)
)
}
# gradient of the scaled negative log-likelihood
grad <- function(X, Fy, eta1, alphas, Omegas) {
# retrieve dimensions
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions
r <- length(p) # single predictor/response tensor order
## "Inverse" Link: Tensor Normal Specific
# known exponential family constants
c1 <- 1
c2 <- -0.5
# Covariances from the GLM parameter Scatter matrices
Deltas <- Map(solve, Omegas)
# First moment via "inverse" link `g1(eta_y) = E[X | Y = y]`
E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
# Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]`
dim(E1) <- c(prod(p), n)
E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1))
## end "Inverse" Link
dim(X) <- c(prod(p), n)
# Residuals
R <- X - E1
dim(R) <- c(p, n)
# mean deviation between the sample covariance and the GLM estimated covariance
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
S <- rowMeans(colKronecker(X, X)) - E2 # <- Optimized for Tensor Normal
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
# Gradients of the negative log-likelihood scaled by sample size
list(
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
"Dl(alphas)" = Map(function(j) {
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
}, 1:r),
"Dl(Omegas)" = Map(function(j) {
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
# adapt to the symmetry constraint for the derivative
dim(deriv) <- c(p[j], p[j])
deriv + t(deriv * (1 - diag(p[j])))
}, 1:r)
)
}
},
bernoulli = {
require(mvbernoulli)
initialize <- function(X, Fy) {
p <- head(dim(X), -1)
r <- length(p)
# Mode-Covariances
XSigmas <- mcov(X, sample.axis = r + 1L)
YSigmas <- mcov(Fy, sample.axis = r + 1L)
# Extract main mode covariance directions
# Note: (the directions are transposed!)
XDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, XSigmas)
YDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, YSigmas)
alphas <- Map(function(xdir, ydir) {
s <- min(ncol(xdir), nrow(ydir))
crossprod(xdir[seq_len(s), , drop = FALSE],
ydir[seq_len(s), , drop = FALSE])
}, XDirs, YDirs)
list(
eta1 = array(0, dim = p),
alphas = alphas,
Omegas = Map(diag, p)
)
}
params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# number of observations
n <- tail(dim(Fy), 1)
# natural exponential family parameters
eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1))
eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas))
# # next the conditional Ising model parameters `theta_y`
# theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n)
# dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n)
# ltri <- which(lower.tri(eta_y2, diag = TRUE))
# diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
# theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1)
# theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ]
# conditional Ising model parameters
theta_y <- matrix(rep(vech(eta_y2), n), ncol = n)
ltri <- which(lower.tri(eta_y2, diag = TRUE))
diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
theta_y[diagonal, ] <- eta_y1
theta_y
}
# Scaled negative log-likelihood
log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# number of observations
n <- tail(dim(X), 1L)
# conditional Ising model parameters
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
# convert to binary data set
storage.mode(X) <- "integer"
X.mvb <- as.mvbinary(mat(X, length(dim(X))))
# log-likelihood of the data set
-mean(sapply(seq_len(n), function(i) {
ising_log_likelihood(theta_y[, i], X.mvb[i])
}))
}
# Gradient of the scaled negative log-likelihood
grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# retrieve dimensions
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions
r <- length(p) # single predictor/response tensor order
## "Inverse" Link: Ising Model Specific
# conditional Ising model parameters: `p (p + 1) / 2` by `n`
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
# conditional expectations
# ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y]
E2 <- apply(theta_y, 2L, ising_marginal_probs)
# convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y]
E2 <- E2[vech.pinv.index(prod(p)), ]
# extract diagonal elements which are equal to E[vec(X) | Y = y]
E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ]
## end "Inverse" Link
dim(X) <- c(prod(p), n)
# Residuals
R <- X - E1
dim(R) <- c(p, n)
# mean deviation between the sample covariance and the GLM estimated covariance
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
S <- rowMeans(colKronecker(X, X) - E2)
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
# Gradients of the negative log-likelihood scaled by sample size
list(
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
"Dl(alphas)" = Map(function(j) {
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
}, 1:r),
"Dl(Omegas)" = Map(function(j) {
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
# adapt to the symmetry constraint for the derivative
dim(deriv) <- c(p[j], p[j])
deriv + t(deriv * (1 - diag(p[j])))
}, 1:r)
)
}
}
)
list(
family = name,
initialize = initialize,
params = params,
# linkinv = linkinv,
log.likelihood = log.likelihood,
grad = grad
)
}
#' @export
GMLM.default <- function(X, Fy, sample.axis = 1L,
family = "normal",
@ -292,6 +33,7 @@ GMLM.default <- function(X, Fy, sample.axis = 1L,
}
}
# optimize likelihood using Nesterov Accelerated Gradient Descent
params.fit <- NAGD(
fun.loss = function(params) {
# scaled negative log-likelihood

View File

@ -24,7 +24,7 @@
#' matrix(1, 2, 3)
#' array(rep(1, 6), dim = c(2, 3))
#' })
#' # basicaly identical to
#' # is identical to
#' stopifnot(exprs = {
#' all.equal(matrix(rep(1, 6), 2, 3), matrix(1, 2, 3))
#' all.equal(matrix(rep(1, 6), 2, 3), array(rep(1, 6), dim = c(2, 3)))

View File

@ -0,0 +1,386 @@
make.gmlm.family <- function(name) {
# standardize family name
name <- list(
normal = "normal", gaussian = "normal",
bernoulli = "bernoulli", ising = "bernoulli"
)[[tolower(name), exact = FALSE]]
############################################################################
# #
# TODO: better (and possibly specialized) initial parameters!?!?!?! #
# #
############################################################################
switch(name,
########################################################################
### Tensor Normal ###
########################################################################
normal = {
initialize <- function(X, Fy) {
p <- head(dim(X), -1)
r <- length(p)
# Mode-Covariances
XSigmas <- mcov(X, sample.axis = r + 1L)
YSigmas <- mcov(Fy, sample.axis = r + 1L)
# Extract main mode covariance directions
# Note: (the directions are transposed!)
XDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, XSigmas)
YDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, YSigmas)
alphas <- Map(function(xdir, ydir) {
s <- min(ncol(xdir), nrow(ydir))
crossprod(xdir[seq_len(s), , drop = FALSE],
ydir[seq_len(s), , drop = FALSE])
}, XDirs, YDirs)
list(
eta1 = rowMeans(X, dims = r),
alphas = alphas,
Omegas = Map(diag, p)
)
}
# parameters of the tensor normal computed from the GLM parameters
params <- function(Fy, eta1, alphas, Omegas) {
Deltas <- Map(solve, Omegas)
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
list(mu_y = mu_y, Deltas = Deltas)
}
# scaled negative log-likelihood
log.likelihood <- function(X, Fy, eta1, alphas, Omegas) {
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions (needed below for `prod(p)`)
# conditional mean
mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas))
# negative log-likelihood scaled by sample size
# Note: the `suppressWarnings` is needed because `log(mapply(det, Omegas))`
# might fail, but `NAGD` has failsafes against cases of "illegal"
# parameters.
suppressWarnings(
0.5 * prod(p) * log(2 * pi) +
sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) -
(0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p)
)
}
# gradient of the scaled negative log-likelihood
grad <- function(X, Fy, eta1, alphas, Omegas) {
# retrieve dimensions
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions
r <- length(p) # single predictor/response tensor order
## "Inverse" Link: Tensor Normal Specific
# known exponential family constants
c1 <- 1
c2 <- -0.5
# Covariances from the GLM parameter Scatter matrices
Deltas <- Map(solve, Omegas)
# First moment via "inverse" link `g1(eta_y) = E[X | Y = y]`
E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas)
# Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]`
dim(E1) <- c(prod(p), n)
E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1))
## end "Inverse" Link
dim(X) <- c(prod(p), n)
# Residuals
R <- X - E1
dim(R) <- c(p, n)
# mean deviation between the sample covariance and the GLM estimated covariance
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
S <- rowMeans(colKronecker(X, X)) - E2 # <- Optimized for Tensor Normal
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
# Gradients of the negative log-likelihood scaled by sample size
list(
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
"Dl(alphas)" = Map(function(j) {
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
}, 1:r),
"Dl(Omegas)" = Map(function(j) {
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
# adapt to the symmetry constraint for the derivative
dim(deriv) <- c(p[j], p[j])
deriv + t(deriv * (1 - diag(p[j])))
}, 1:r)
)
}
# mean conditional Fisher Information
fisher.info <- function(Fy, eta1, alphas, Omegas) {
# retrieve dimensions
n <- tail(dim(Fy), 1) # sample size
p <- dim(eta1) # predictor dimensions
q <- head(dim(Fy), -1) # response dimensions
r <- length(p) # single predictor/response tensor order
# Hij = Cov(ti(X), tj(X) | Y = y), i, j = 1, 2
H11 <- Reduce(`%x%`, rev(Map(solve, Omegas))) # covariance
# 3rd central moment is zero
H12 <- H21 <- 0
# 4th moment by "Isserlis' theorem" a.k.a. "Wick's theorem"
H22 <- kronecker(H11, H11)
dim(H22) <- rep(prod(p), 4)
# the two remaining Isserlis pairings are index permutations of `H11 %x% H11`
H22 <- H22 + aperm(H22, c(1, 3, 2, 4)) + aperm(H22, c(1, 2, 4, 3))
dim(H11) <- c(p, p)
dim(H22) <- c(p, p, p, p)
## Fisher Information: Tensor Normal Specific
# known exponential family constants
c1 <- 1
c2 <- -0.5
# list of (Fy x_{k in [r]\j} alpha_j)
Gys <- Map(function(j) {
mlm(Fy, alphas[-j], (1:r)[-j])
}, 1:r)
# setup indices of lower triangular matrix elements used for
# all tuples (j, l) with 1 <= j <= l <= r.
ltri <- lower.tri(diag(r), TRUE)
J <- .col(c(r, r))[ltri]
L <- .row(c(r, r))[ltri]
# inverse perfect outer shuffle of `2 r` elements
iShuf <- rep(1:r, each = 2) + c(0, r)
# Getter for the i'th observation from Gys[[j]]
# TODO: this is ugly, find a better (but still dynamic) version
GyGet <- function(i, j) {
obs <- eval(str2lang(paste(
"Gys[[j]][",
paste(rep(",", r), collapse = ""),
"i, drop = FALSE]"
, collapse = "")))
dim(obs) <- head(dim(obs), -1)
obs
}
I11 <- c1^2 * mat(H11, 1:r)
I12 <- Map(function(j) {
i11 <- ttt(Gys[[j]], H11, (1:r)[-j])
# take mean with respect to observations (second mode)
i11 <- colMeans(mat(i11, 2))
dim(i11) <- c(q[j], p[j], p)
t(mat(i11, 2:1))
}, 1:r)
I13 <- Map(matrix, 0, prod(p), p^2)
I22 <- Map(function(j, l) {
i22 <- 0
for (i in 1:n) {
i22 <- i22 + ttt(
ttt(GyGet(i, j), H11, (1:r)[-j]),
GyGet(i, l), (1:r)[-l] + 2, (1:r)[-l])
}
(c1^2 / n) * mat(i22, 2:1)
}, J, L)
I23 <- Map(matrix, 0, (p * q)[J], (p^2)[L])
# shuffled H22
sH22 <- c2^2 * aperm(H22, c(iShuf, iShuf + 2 * r))
dim(sH22) <- c(p^2, p^2)
I33 <- Map(function(j, l) {
# second derivative (without constraints)
deriv <- mlm(mlm(sH22, Map(c, Omegas[-j]), (1:r)[-j],
transposed = TRUE),
Map(c, Omegas[-l]), (1:r)[-l] + r,
transposed = TRUE)
dim(deriv) <- c(p[j], p[j], p[l], p[l])
# enforce symmetry of `Omega_j`
of.diag <- (slice.index(deriv, 1:2) != slice.index(deriv, 2:1))
deriv <- deriv + of.diag * aperm(deriv, c(2, 1, 3, 4))
# as well as the symmetry of `Omega_l`
of.diag <- (slice.index(deriv, 3:4) != slice.index(deriv, 4:3))
deriv <- deriv + of.diag * aperm(deriv, c(1, 2, 4, 3))
# matricize and return
dim(deriv) <- c(p[j]^2, p[l]^2)
deriv
}, J, L)
names(I12) <- sprintf("I(eta1, alpha_%d)", 1:r)
names(I13) <- sprintf("I(eta1, Omega_%d)", 1:r)
names(I22) <- sprintf("I(alpha_%d, alpha_%d)", J, L)
names(I23) <- sprintf("I(alpha_%d, Omega_%d)", J, L)
names(I33) <- sprintf("I(Omega_%d, Omega_%d)", J, L)
list(I11 = I11, I12 = I12, I13 = I13, I22 = I22, I23 = I23, I33 = I33)
}
# Hessian of the scaled negative log-likelihood
hessian <- function(X, Fy, eta1, alphas, Omegas) {
stop("Not Implemented")
}
},
########################################################################
### Multi-Variate Bernoulli ###
########################################################################
bernoulli = {
require(mvbernoulli)
initialize <- function(X, Fy) {
p <- head(dim(X), -1)
r <- length(p)
# Mode-Covariances
XSigmas <- mcov(X, sample.axis = r + 1L)
YSigmas <- mcov(Fy, sample.axis = r + 1L)
# Extract main mode covariance directions
# Note: (the directions are transposed!)
XDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, XSigmas)
YDirs <- Map(function(Sigma) {
SVD <- La.svd(Sigma, nu = 0)
sqrt(SVD$d) * SVD$vt
}, YSigmas)
alphas <- Map(function(xdir, ydir) {
s <- min(ncol(xdir), nrow(ydir))
crossprod(xdir[seq_len(s), , drop = FALSE],
ydir[seq_len(s), , drop = FALSE])
}, XDirs, YDirs)
list(
eta1 = array(0, dim = p),
alphas = alphas,
Omegas = Map(diag, p)
)
}
params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# number of observations
n <- tail(dim(Fy), 1)
# natural exponential family parameters
eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1))
eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas))
# # next the conditional Ising model parameters `theta_y`
# theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n)
# dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n)
# ltri <- which(lower.tri(eta_y2, diag = TRUE))
# diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
# theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1)
# theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ]
# conditional Ising model parameters
theta_y <- matrix(rep(vech(eta_y2), n), ncol = n)
ltri <- which(lower.tri(eta_y2, diag = TRUE))
diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri])
theta_y[diagonal, ] <- eta_y1
theta_y
}
# Scaled negative log-likelihood
log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# number of observations
n <- tail(dim(X), 1L)
# conditional Ising model parameters
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
# convert to binary data set
storage.mode(X) <- "integer"
X.mvb <- as.mvbinary(mat(X, length(dim(X))))
# log-likelihood of the data set
-mean(sapply(seq_len(n), function(i) {
ising_log_likelihood(theta_y[, i], X.mvb[i])
}))
}
# Gradient of the scaled negative log-likelihood
grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) {
# retrieve dimensions
n <- tail(dim(X), 1) # sample size
p <- head(dim(X), -1) # predictor dimensions
r <- length(p) # single predictor/response tensor order
## "Inverse" Link: Ising Model Specific
# conditional Ising model parameters: `p (p + 1) / 2` by `n`
theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2)
# conditional expectations
# ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y]
E2 <- apply(theta_y, 2L, ising_marginal_probs)
# convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y]
E2 <- E2[vech.pinv.index(prod(p)), ]
# extract diagonal elements which are equal to E[vec(X) | Y = y]
E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ]
## end "Inverse" Link
dim(X) <- c(prod(p), n)
# Residuals
R <- X - E1
dim(R) <- c(p, n)
# mean deviation between the sample covariance and the GLM estimated covariance
# `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))`
S <- rowMeans(colKronecker(X, X) - E2)
dim(S) <- c(p, p) # reshape to tensor of order `2 r`
# Gradients of the negative log-likelihood scaled by sample size
list(
"Dl(eta1)" = -c1 * rowMeans(R, dims = r),
"Dl(alphas)" = Map(function(j) {
(-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j)
}, 1:r),
"Dl(Omegas)" = Map(function(j) {
deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j]))
# adapt to the symmetry constraint for the derivative
dim(deriv) <- c(p[j], p[j])
deriv + t(deriv * (1 - diag(p[j])))
}, 1:r)
)
}
# Hessian of the scaled negative log-likelihood
hessian <- function(X, Fy, alphas, Omegas, c1 = 1, c2 = 1) {
stop("Not Implemented")
}
# Conditional Fisher Information
fisher.info <- function(Fy, alphas, Omegas) {
stop("Not Implemented")
}
}
)
list(
family = name,
initialize = initialize,
params = params,
# linkinv = linkinv,
log.likelihood = log.likelihood,
grad = grad,
hessian = hessian,
fisher.info = fisher.info
)
}
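# A minimal usage sketch of the family interface (illustrative data only;
# assumes the package helpers `mlm` and `mcov` used above are available):
#
#   p <- c(3, 4); q <- c(2, 3); n <- 100
#   X  <- array(rnorm(n * prod(p)), dim = c(p, n))
#   Fy <- array(rnorm(n * prod(q)), dim = c(q, n))
#   family <- make.gmlm.family("normal")
#   theta0 <- family$initialize(X, Fy)        # list(eta1, alphas, Omegas)
#   nll    <- with(theta0, family$log.likelihood(X, Fy, eta1, alphas, Omegas))
#   grads  <- with(theta0, family$grad(X, Fy, eta1, alphas, Omegas))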

View File

@ -15,35 +15,43 @@
#'
#' @export
matrixImage <- function(A, add.values = FALSE,
main = NULL, sub = NULL, interpolate = FALSE, ...,
main = NULL, sub = NULL, interpolate = FALSE, ..., zlim = NA,
axes = TRUE, asp = 1, col = hcl.colors(24, "YlOrRd", rev = FALSE),
digits = getOption("digits")
) {
# plot raster image
plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "red",
xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main, sub = sub)
plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "black",
xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main, sub = sub,
asp = asp)
# Scale values of `A` to [0, 1] with min mapped to 1 and max to 0.
S <- (max(A) - A) / diff(range(A))
if (missing(zlim)) {
S <- (max(A) - A) / diff(range(A))
} else {
S <- pmin(pmax(0, (zlim[2] - A) / diff(zlim)), 1)
}
# and now transform to color
S <- matrix(col[round((length(col) - 1) * S + 1)], nrow = nrow(A))
# Raster image plotting the matrix with element values mapped to the color
# scale such that big elements (of the original matrix A) are dark and small
# (negative) elements are light.
rasterImage(S, 0, 0, ncol(A), nrow(A), interpolate = interpolate, ...)
# Add X-axis giving index
# X/Y axes index (matches coordinates to matrix indices)
x <- seq(1, ncol(A), by = 1)
axis(1, at = x - 0.5, labels = x, lwd = 0, lwd.ticks = 1)
# Add Y-axis
y <- seq(1, nrow(A))
axis(2, at = y - 0.5, labels = rev(y), lwd = 0, lwd.ticks = 1, las = 1)
if (axes) {
axis(1, at = x - 0.5, labels = x, lwd = 0, lwd.ticks = 1)
axis(2, at = y - 0.5, labels = rev(y), lwd = 0, lwd.ticks = 1, las = 1)
}
# Writes matrix values (in colored element grids)
# Writes matrix values
if (any(add.values)) {
if (length(add.values) > 1) {
A[!add.values] <- NA
A[add.values] <- format(A[add.values], digits = digits)
}
text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A,
adj = 0.5, col = as.integer(S > 0.65))
text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A, adj = 0.5)
}
}
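# A short usage sketch of the extended interface (illustrative data; `zlim`
# fixes the value-to-color mapping across plots, `axes = FALSE` drops the
# index axes):
#
#   A <- matrix(rnorm(4 * 6), 4, 6)
#   matrixImage(A, add.values = TRUE, zlim = c(-3, 3), main = "A")
#   matrixImage(abs(A), zlim = c(-3, 3), axes = FALSE, main = "|A|")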

View File

@ -3,34 +3,101 @@
#' @example inst/examples/num_deriv.R
#'
#' @export
num.deriv <- function(F, X, h = 1e-6, sym = FALSE) {
num.deriv <- function(expr, ..., h = 1e-6, sym = FALSE) {
sexpr <- substitute(expr)
if (...length() != 1) {
stop("Expectd exactly one '...' variable")
}
var <- ...names()[1]
if (is.null(var)) {
arg <- substitute(...)
var <- if (is.symbol(arg)) as.character(arg) else "x"
}
if (is.language(sexpr) && !is.symbol(sexpr) && sexpr[[1]] == as.symbol("function")) {
func <- expr
} else {
if (is.name(sexpr)) {
expr <- call(as.character(sexpr), as.name(var))
} else {
if ((!is.call(sexpr) && !is.expression(sexpr))
|| !(var %in% all.vars(sexpr))) {
stop("'expr' must be a function or expression containing '", var, "'")
}
expr <- sexpr
}
args <- as.pairlist(structure(list(alist(x = )[[1]]), names = var))
func <- as.function(c(args, expr), envir = parent.frame())
}
num.deriv.function(func, ..1, h = h, sym = sym)
}
#' @rdname num.deriv
#' @export
num.deriv.function <- function(FUN, X, h = 1e-6, sym = FALSE) {
if (sym) {
stopifnot(isSymmetric(X))
p <- nrow(X)
k <- seq_along(X) - 1
mapply(function(i, j) {
dx <- h * ((k == i * p + j) | (k == j * p + i))
(F(X + dx) - F(X - dx)) / (2 * h)
(FUN(X + dx) - FUN(X - dx)) / (2 * h)
}, .row(dim(X)) - 1, .col(dim(X)) - 1)
} else {
sapply(seq_along(X), function(i) {
dx <- h * (seq_along(X) == i)
(F(X + dx) - F(X - dx)) / (2 * h)
(FUN(X + dx) - FUN(X - dx)) / (2 * h)
})
}
}
#' Second numeric derivative
#'
#' @rdname num.deriv
#' @export
num.deriv2 <- function(F, X, Y, h = 1e-6, symX = FALSE, symY = FALSE) {
num.deriv2 <- function(FUN, X, Y, h = 1e-6, symX = FALSE, symY = FALSE) {
if (missing(Y)) {
num.deriv(function(x) {
num.deriv(function(z) F(z), x, h = h, sym = symX)
num.deriv.function(function(x) {
num.deriv.function(FUN, x, h = h, sym = symX)
}, X, h = h, sym = symX)
} else {
num.deriv(function(y) {
num.deriv(function(x) F(x, y), X, h = h, sym = symX)
num.deriv.function(function(y) {
num.deriv.function(function(x) FUN(x, y), X, h = h, sym = symX)
}, Y, h = h, sym = symY)
}
}
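# A small sketch using `num.deriv2` as a numeric Hessian (illustrative
# function; for f(x) = sum(x^2) the Hessian is 2 * diag(length(x))):
#
#   f <- function(x) sum(x^2)
#   H <- num.deriv2(f, c(1, 2, 3))
#   stopifnot(all.equal(H, 2 * diag(3), tolerance = 1e-4))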
### Interface Idea / Demo
# num.deriv2.function
# num.deriv2 <- function(expr, var) {
# sexpr <- substitute(expr)
# svar <- substitute(var)
#
# if (is.language(sexpr) && !is.symbol(sexpr) && sexpr[[1]] == as.symbol("function")) {
# func <- expr
# } else {
# if (is.name(sexpr)) {
# expr <- call(as.character(sexpr), as.name(svar))
# } else {
# if ((!is.call(sexpr) && !is.expression(sexpr))
# || !(as.character(svar) %in% all.vars(sexpr))) {
# stop("'expr' must be a function or expression containing '",
# as.character(svar), "'")
# }
# expr <- sexpr
# }
#
# args <- as.pairlist(structure(list(alist(x = )[[1]]), names = as.character(svar)))
# func <- as.function(c(args, expr), envir = parent.frame())
# }
#
# num.deriv2.function(func, var)
# }
# y <- c(pi, exp(1), (sqrt(5) + 1) / 2)
# num.deriv2(function(x) x^2 + y, x)(1:3)
# num.deriv2(x^2 + y, x)(1:3)
# func <- function(x) x^2 + y
# num.deriv2(func, x)(1:3)
# func2 <- function(z = y, x) x^2 + z
# num.deriv2(func2, x)(1:3)

View File

@ -4,15 +4,17 @@ A <- A + t(A)
B <- matrix(rnorm(3 * 4), 3, 4)
# F(A) = A B
# DF(A) = B' x I
stopifnot(all.equal(
num.deriv(function(X) X %*% B, A),
# DF(A) = B' %x% I
exprs.all.equal({
t(B) %x% diag(nrow(A))
))
num.deriv(function(X) X %*% B, A)
num.deriv(A %*% B, A)
num.deriv(X %*% B, X = A)
})
# Symmetric case, constraint A = A' (equiv to being a function of vech(A) only)
# F(A) = A B for A = A'
# DF(A) = B' x I
# DF(A) = B' %x% I
stopifnot(all.equal(
num.deriv(function(X) X %*% B, A, sym = TRUE),
(t(B) %x% diag(nrow(A))) %*% D(nrow(A)) %*% t(D(nrow(A)))

View File

@ -0,0 +1,4 @@
library(testthat)
library(tensorPredictors)
test_check("tensorPredictors")

View File

@ -0,0 +1,50 @@
# F(X) = log(det(X)) if det(X) > 0
# dF(X) = tr(X^-1 dX)
# DF(X) = vec((X^-1)')'
test_that("Matrix Calculus 1", {
# test data
X <- tcrossprod(diag(5) + matrix(runif(5^2, -.1, .1), 5))
num.grad <- num.deriv(function(X) log(det(X)), X)
ana.grad <- c(solve(X))
expect_equal(num.grad, ana.grad)
})
# F(mu) = <X - mu, (X - mu) x_{k in [r]} Delta_k^-1> for Delta_k = Delta_k'
# DF(mu) = -2 vec((X - mu) x_{k in [r]} Delta_k^-1)'
test_that("Matrix Calculus 2", {
p <- c(2, 3, 5)
X <- array(rnorm(prod(p)), dim = p)
mu <- array(rnorm(prod(p)), dim = p)
Deltas <- Map(function(pj) tcrossprod(diag(pj) + runif(pj^2, -0.1, 0.1)), p)
ana.grad <- -2 * c(mlm(X - mu, Map(solve, Deltas)))
num.grad <- num.deriv(function(mu) sum((X - mu) * mlm(X - mu, Map(solve, Deltas))), mu)
expect_equal(num.grad, ana.grad)
})
# F(Delta_j) = <X - mu, (X - mu) x_{k in [r]} Delta_k^-1> for Delta_k = Delta_k'
# DF(Delta_j) = -((X - mu) x_{k in [r]} Delta_k^-1)_(j) (X - mu)_(j)' Delta_j^-1
test_that("Matrix Calculus 3", {
# config
p <- c(2, 3, 5)
# generate test data
X <- array(rnorm(prod(p)), dim = p)
mu <- array(rnorm(prod(p)), dim = p)
Deltas <- Map(function(pj) tcrossprod(diag(pj) + runif(pj^2, -0.1, 0.1)), p)
# check analytic to numeric derivatives
for (j in seq_along(Deltas)) {
num.grad <- matrix(num.deriv(function(Delta_j) {
Deltas[[j]] <- Delta_j
sum((X - mu) * mlm(X - mu, Map(solve, Deltas)))
}, Deltas[[j]]), p[j])
ana.grad <- -mcrossprod(mlm(X - mu, Map(solve, Deltas)), X - mu, j) %*% solve(Deltas[[j]])
expect_equal(num.grad, ana.grad)
}
})

View File

@ -0,0 +1,186 @@
test_that("Tensor-Normal Score", {
# setup dimensions
n <- 11L
p <- c(3L, 5L, 4L)
q <- c(2L, 7L, 3L)
# create "true" GLM parameters
eta1 <- array(rnorm(prod(p)), dim = p)
alphas <- Map(matrix, Map(rnorm, p * q), p)
Omegas <- Map(function(pj) {
solve(0.5^abs(outer(seq_len(pj), seq_len(pj), `-`)))
}, p)
params <- list(eta1, alphas, Omegas)
# compute tensor normal parameters from GLM parameters
Deltas <- Map(solve, Omegas)
mu <- mlm(eta1, Deltas)
# sample some test data
sample.axis <- length(p) + 1L
Fy <- array(rnorm(n * prod(q)), dim = c(q, n))
X <- mlm(Fy, Map(`%*%`, Deltas, alphas)) + rtensornorm(n, mu, Deltas, sample.axis)
# Create a GLM family object
family <- make.gmlm.family("normal")
# first unpack the family object
log.likelihood <- family$log.likelihood
grad <- family$grad
# compare numeric gradient against the analytic gradient provided by the
# family at the initial parameters
grad.num <- do.call(function(eta1, alphas, Omegas) {
list(
"Dl(eta1)" = num.deriv.function(function(eta1) {
log.likelihood(X, Fy, eta1, alphas, Omegas)
}, eta1),
"Dl(alphas)" = Map(function(j) num.deriv.function(function(alpha_j) {
alphas[[j]] <- alpha_j
log.likelihood(X, Fy, eta1, alphas, Omegas)
}, alphas[[j]]), seq_along(alphas)),
"Dl(Omegas)" = Map(function(j) num.deriv.function(function(Omega_j) {
Omegas[[j]] <- Omega_j
log.likelihood(X, Fy, eta1, alphas, Omegas)
}, Omegas[[j]], sym = TRUE), seq_along(Omegas))
)
}, params)
grad.ana <- do.call(grad, c(list(X, Fy), params))
expect_equal(RMap(c, grad.ana), grad.num, tolerance = 1e-6)
# test for correct dimensions
ana.dims <- list(
"Dl(eta1)" = p,
"Dl(alphas)" = Map(c, p, q),
"Dl(Omegas)" = Map(c, p, p)
)
expect_identical(RMap(dim, grad.ana), ana.dims)
})
test_that("Tensor-Normal moments of the sufficient statistic t(X)", {
# config
# # (estimated) sample size for sample estimates
# n <- 250000 # sample estimates are too unreliable for testing purposes
p <- c(2L, 3L)
r <- length(p)
# setup tensor normal parameters (NO dependence of Fy, set to zero)
mu <- array(runif(prod(p), min = -0.5, max = 0.5), dim = p) # = mu_y
Deltas <- Map(function(pj) 0.5^abs(outer(1:pj, 1:pj, `-`)), p)
# compute GMLM parameters (NO alphas as Fy is zero)
Omegas <- Map(solve, Deltas)
eta1 <- mlm(mu, Omegas)
# natural parameters of the tensor normal
eta_y1 <- eta1 # + mlm(Fy, alphas) = + 0
eta_y2 <- -0.5 * Reduce(`%x%`, rev(Omegas))
# # draw a sample (unused here; the sample-based checks below are commented out)
# X <- rtensornorm(n, mu, Deltas, r + 1L)
# tensor normal log-partition function given eta_y
log.partition <- function(eta_y1, eta_y2) {
# eta_y1 as "model" matrix of dimensions `n by p` where `n` might be `1`
if (length(eta_y1)^2 == length(eta_y2)) {
pp <- length(eta_y1)
dim(eta_y1) <- c(1L, pp)
} else {
eta_y1 <- mat(eta_y1, r + 1L)
pp <- ncol(eta_y1)
}
# treat eta_y2 as square matrix
if (!is.matrix(eta_y2)) {
dim(eta_y2) <- c(pp, pp)
}
# evaluate log-partition function in terms of natural parameters
-0.25 * pp * mean((eta_y1 %*% solve(eta_y2)) * eta_y1) - 0.5 * log(det(-2 * eta_y2))
}
# (analytic) first and second (un-centered) moment of the tensor normal
# Eti = E[ti(X) | Y = y]
Et1.ana <- mu
Et2.ana <- Reduce(`%x%`, rev(Deltas)) + outer(c(mu), c(mu))
# (numeric) derivative of the log-partition function with respect to the natural
# parameters of the exponential family form of the tensor normal
Et1.num <- num.deriv(log.partition(eta_y1, eta_y2), eta_y1)
Et2.num <- num.deriv(log.partition(eta_y1, eta_y2), eta_y2)
# # (estimated) estimate from sample
# Et1.est <- rowMeans(X, dims = r)
# Et2.est <- colMeans(rowKronecker(mat(X, r + 1L), mat(X, r + 1L)))
# (analytic) covariance blocks of the sufficient statistic
# Ctij = Cov(ti(X), tj(X) | Y = y)
Ct11.ana <- Reduce(`%x%`, rev(Deltas))
Ct12.ana <- local({
# Analytic solution / `vec(mu %x% Delta) + vec(mu' %x% Delta)`
ct12 <- 2 * c(mu) %x% Reduce(`%x%`, rev(Deltas))
# and symmetrize!
dim(ct12) <- rep(prod(p), 3)
(1 / 2) * (ct12 + aperm(ct12, c(3, 1, 2)))
})
Ct22.ana <- local({
Delta <- Reduce(`%x%`, rev(Deltas)) # Ct11.ana
ct22 <- (2 * Delta + 4 * outer(c(mu), c(mu))) %x% Delta
# TODO: What does this symmetrization do exactly? And why?!
dim(ct22) <- rep(prod(p), 4)
(1 / 4) * (
aperm(ct22, c(2, 3, 1, 4)) +
aperm(ct22, c(2, 3, 4, 1)) +
aperm(ct22, c(3, 2, 1, 4)) +
aperm(ct22, c(3, 2, 4, 1))
)
})
# (numeric) second derivative of the log-partition function
Ct11.num <- num.deriv2(function(eta_y1) log.partition(eta_y1, eta_y2), eta_y1)
Ct12.num <- num.deriv2(log.partition, eta_y1, eta_y2)
Ct22.num <- local({
ct22 <- num.deriv2(function(eta_y2) log.partition(eta_y1, eta_y2), eta_y2)
# TODO: Why does this need to be symmetrized?!
dim(ct22) <- rep(prod(p), 4)
(1 / 4) * (
aperm(ct22, c(3, 4, 2, 1)) +
aperm(ct22, c(4, 3, 2, 1)) +
aperm(ct22, c(3, 4, 1, 2)) +
aperm(ct22, c(4, 3, 1, 2))
)
})
# # (estimated) sample estimates of the sufficient statistic covariance blocks
# T1 <- mat(X, r + 1L)
# T2 <- rowKronecker(T1, T1)
# Ct11.est <- cov(T1)
# Ct12.est <- cov(T1, T2)
# Ct22.est <- cov(T2, T2)
tolerance <- 1e-3
expect_equal(c(Et1.ana), c(Et1.num), tolerance = tolerance)
# expect_equal(c(Et1.ana), c(Et1.est), tolerance = 0.05, scale = 1)
# expect_equal(c(Et1.num), c(Et1.est), tolerance = 0.05, scale = 1)
expect_equal(c(Et2.ana), c(Et2.num), tolerance = tolerance)
# expect_equal(c(Et2.ana), c(Et2.est), tolerance = 0.05, scale = 1)
# expect_equal(c(Et2.num), c(Et2.est), tolerance = 0.05, scale = 1)
expect_equal(c(Ct11.ana), c(Ct11.num), tolerance = tolerance)
# expect_equal(c(Ct11.ana), c(Ct11.est), tolerance = 0.05, scale = 1)
# expect_equal(c(Ct11.num), c(Ct11.est), tolerance = 0.05, scale = 1)
expect_equal(c(Ct12.ana), c(Ct12.num), tolerance = tolerance)
# expect_equal(c(Ct12.ana), c(Ct12.est), tolerance = 0.05, scale = 1)
# expect_equal(c(Ct12.num), c(Ct12.est), tolerance = 0.05, scale = 1)
expect_equal(c(Ct22.ana), c(Ct22.num), tolerance = tolerance)
# expect_equal(c(Ct22.ana), c(Ct22.est), tolerance = 0.05, scale = 1)
# expect_equal(c(Ct22.num), c(Ct22.est), tolerance = 0.05, scale = 1)
})