test_that("Tensor-Normal Score", {
    # setup dimensions
    n <- 11L
    p <- c(3L, 5L, 4L)
    q <- c(2L, 7L, 3L)

    # create "true" GLM parameters
    eta1 <- array(rnorm(prod(p)), dim = p)
    alphas <- Map(matrix, Map(rnorm, p * q), p)
    Omegas <- Map(function(pj) {
        solve(0.5^abs(outer(seq_len(pj), seq_len(pj), `-`)))
    }, p)
    params <- list(eta1, alphas, Omegas)

    # compute tensor normal parameters from GLM parameters
    Deltas <- Map(solve, Omegas)
    mu <- mlm(eta1, Deltas)

    # sample some test data
    sample.axis <- length(p) + 1L
    Fy <- array(rnorm(n * prod(q)), dim = c(q, n))
    X <- mlm(Fy, Map(`%*%`, Deltas, alphas)) + rtensornorm(n, mu, Deltas, sample.axis)

    # Create a GLM family object
    family <- make.gmlm.family("normal")

    # first unpack the family object
    log.likelihood <- family$log.likelihood
    grad           <- family$grad

    # compare numeric gradient against the analytic gradient provided by the
    # family at the initial parameters
    grad.num <- do.call(function(eta1, alphas, Omegas) {
        list(
            "Dl(eta1)" = num.deriv.function(function(eta1) {
                log.likelihood(X, Fy, eta1, alphas, Omegas)
            }, eta1),
            "Dl(alphas)" = Map(function(j) num.deriv.function(function(alpha_j) {
                alphas[[j]] <- alpha_j
                log.likelihood(X, Fy, eta1, alphas, Omegas)
            }, alphas[[j]]), seq_along(alphas)),
            "Dl(Omegas)" = Map(function(j) num.deriv.function(function(Omega_j) {
                Omegas[[j]] <- Omega_j
                log.likelihood(X, Fy, eta1, alphas, Omegas)
            }, Omegas[[j]], sym = TRUE), seq_along(Omegas))
        )
    }, params)
    grad.ana <- do.call(grad, c(list(X, Fy), params))

    expect_equal(RMap(c, grad.ana), grad.num, tolerance = 1e-6)

    # test for correct dimensions
    ana.dims <- list(
        "Dl(eta1)" = p,
        "Dl(alphas)" = Map(c, p, q),
        "Dl(Omegas)" = Map(c, p, p)
    )

    expect_identical(RMap(dim, grad.ana), ana.dims)
})

test_that("Tensor-Normal moments of the sufficient statistic t(X)", {
    # config
    # # (estimated) sample size for sample estimates
    # n <- 250000   # sample estimates are too unreliable for testing purposes
    p <- c(2L, 3L)
    r <- length(p)

    # setup tensor normal parameters (NO dependence of Fy, set to zero)
    mu <- array(runif(prod(p), min = -0.5, max = 0.5), dim = p)    # = mu_y
    Deltas <- Map(function(pj) 0.5^abs(outer(1:pj, 1:pj, `-`)), p)

    # compute GMLM parameters (NO alphas as Fy is zero)
    Omegas <- Map(solve, Deltas)
    eta1 <- mlm(mu, Omegas)

    # natural parameters of the tensor normal
    eta_y1 <- eta1 # + mlm(Fy, alphas) = + 0
    eta_y2 <- -0.5 * Reduce(`%x%`, rev(Omegas))

    # # draw a sample
    # X <- rtensornorm(n, mu, Deltas, r + 1L)

    # tensor normal log-partition function given eta_y
    log.partition <- function(eta_y1, eta_y2) {
        # eta_y1 as "model" matrix of dimensions `n by p` where `n` might be `1`
        if (length(eta_y1)^2 == length(eta_y2)) {
            pp <- length(eta_y1)
            dim(eta_y1) <- c(1L, pp)
        } else {
            eta_y1 <- mat(eta_y1, r + 1L)
            pp <- ncol(eta_y1)
        }

        # treat eta_y2 as square matrix
        if (!is.matrix(eta_y2)) {
            dim(eta_y2) <- c(pp, pp)
        }

        # evaluate log-partiton function in terms of natural parameters
        -0.25 * pp * mean((eta_y1 %*% solve(eta_y2)) * eta_y1) - 0.5 * log(det(-2 * eta_y2))
    }


    # (analytic) first and second (un-centered) moment of the tensor normal
    #   Eti = E[ti(X) | Y = y]
    Et1.ana <- mu
    Et2.ana <- Reduce(`%x%`, rev(Deltas)) + outer(c(mu), c(mu))

    # (numeric) derivative of the log-partition function with respect to the natural
    # parameters of the exponential family form of the tensor normal
    Et1.num <- num.deriv(log.partition(eta_y1, eta_y2), eta_y1)
    Et2.num <- num.deriv(log.partition(eta_y1, eta_y2), eta_y2)

    # # (estimated) estimate from sample
    # Et1.est <- rowMeans(X, dims = r)
    # Et2.est <- colMeans(rowKronecker(mat(X, r + 1L), mat(X, r + 1L)))

    # (analytic) convariance blocks of the sufficient statistic
    #   Ctij = Cov(ti(X), tj(X) | Y = y) for i, j = 1, 2
    Ct11.ana <- Reduce(`%x%`, rev(Deltas))
    Ct12.ana <- local({
        # Analytic solution / `vec(mu %x% Delta) + vec(mu' %x% Delta)`
        ct12 <- c(mu) %x% Reduce(`%x%`, rev(Deltas))
        # and symmetrize!
        dim(ct12) <- rep(prod(p), 3)
        ct12 + aperm(ct12, c(3, 1, 2))
    })
    Ct22.ana <- local({
        Delta <- Reduce(`%x%`, rev(Deltas)) # Ct11.ana
        ct22 <- (2 * Delta + 4 * outer(c(mu), c(mu))) %x% Delta

        # TODO: What does this symmetrization do exactly? And why?!
        dim(ct22) <- rep(prod(p), 4)
        (1 / 4) * (
            aperm(ct22, c(2, 3, 1, 4)) +
            aperm(ct22, c(2, 3, 4, 1)) +
            aperm(ct22, c(3, 2, 1, 4)) +
            aperm(ct22, c(3, 2, 4, 1))
        )
    })

    # (numeric) second derivative of the log-partition function
    Ct11.num <- num.deriv2(function(eta_y1) log.partition(eta_y1, eta_y2), eta_y1)
    Ct12.num <- num.deriv2(log.partition, eta_y1, eta_y2)
    Ct22.num <- local({
        ct22 <- num.deriv2(function(eta_y2) log.partition(eta_y1, eta_y2), eta_y2)
        # TODO: Why does this need to be symmetrized?!
        dim(ct22) <- rep(prod(p), 4)
        (1 / 4) * (
            aperm(ct22, c(3, 4, 2, 1)) +
            aperm(ct22, c(4, 3, 2, 1)) +
            aperm(ct22, c(3, 4, 1, 2)) +
            aperm(ct22, c(4, 3, 1, 2))
        )
    })

    # # (estimated) sample estimates of the sufficient statistic convariance blocks
    # T1 <- mat(X, r + 1L)
    # T2 <- rowKronecker(T1, T1)
    # Ct11.est <- cov(T1)
    # Ct12.est <- cov(T1, T2)
    # Ct22.est <- cov(T2, T2)


    tolerance <- 1e-3
    expect_equal(c(Et1.ana), c(Et1.num), tolerance = tolerance)
    # expect_equal(c(Et1.ana), c(Et1.est), tolerance = 0.05, scale = 1)
    # expect_equal(c(Et1.num), c(Et1.est), tolerance = 0.05, scale = 1)

    expect_equal(c(Et2.ana), c(Et2.num), tolerance = tolerance)
    # expect_equal(c(Et2.ana), c(Et2.est), tolerance = 0.05, scale = 1)
    # expect_equal(c(Et2.num), c(Et2.est), tolerance = 0.05, scale = 1)

    expect_equal(c(Ct11.ana), c(Ct11.num), tolerance = tolerance)
    # expect_equal(c(Ct11.ana), c(Ct11.est), tolerance = 0.05, scale = 1)
    # expect_equal(c(Ct11.num), c(Ct11.est), tolerance = 0.05, scale = 1)

    expect_equal(c(Ct12.ana), c(Ct12.num), tolerance = tolerance)
    # expect_equal(c(Ct12.ana), c(Ct12.est), tolerance = 0.05, scale = 1)
    # expect_equal(c(Ct12.num), c(Ct12.est), tolerance = 0.05, scale = 1)

    expect_equal(c(Ct22.ana), c(Ct22.num), tolerance = tolerance)
    # expect_equal(c(Ct22.ana), c(Ct22.est), tolerance = 0.05, scale = 1)
    # expect_equal(c(Ct22.num), c(Ct22.est), tolerance = 0.05, scale = 1)
})