Add efficiency simulation

2025-10-31 14:58:32 +01:00 · 2025-10-31 14:58:32 +01:00 · 3724166e8d
commit 3724166e8d
parent 491688378c
6 changed files with 233 additions and 55 deletions
--- a/LaTeX/main.bib
+++ b/LaTeX/main.bib
@ -2392,7 +2392,7 @@


@misc{lichess-database,
-    author = {Duplessis, Thibault},
+    author = {Thibault Duplessis},
    year   = {2013},
    title  = {lichess.org open database},
    url    = {https://database.lichess.org},
--- a/dataAnalysis/eeg/02_eeg_2d.R
+++ b/dataAnalysis/eeg/02_eeg_2d.R
@ -52,10 +52,10 @@ c(X, y) %<-% readRDS("eeg_data_2d.rds")
 #'
 #' @param X 3D EEG data (preprocessed or not)
 #' @param F binary responce `y` as a 3D tensor, every obs. is a 1 x 1 matrix
-loo.predict.gmlm <- function(X, y) {
+loo.predict.gmlm <- function(X, y, ...) {
    unlist(parallel::mclapply(seq_along(y), function(i) {
        # Fit with i'th observation removed
-        fit <- gmlm_tensor_normal(X[ , , -i], as.integer(y[-i]), sample.axis = 3L)
+        fit <- gmlm_tensor_normal(X[ , , -i], as.integer(y[-i]), sample.axis = 3L, ...)

        # Reduce the entire data set
        r <- as.vector(mlm(X, fit$betas, modes = 1:2, transpose = TRUE))
@ -75,26 +75,35 @@ loo.predict.gmlm <- function(X, y) {
    }, mc.cores = getOption("mc.cores", max(1L, parallel::detectCores() - 1L))))
 }

+proj.fft <- function(beta1, nr.freq = 5L) {
+    F <- fft(beta1)
+    Re(fft(`[<-`(F, head(order(abs(F)), -nr.freq), 0+0i), inverse = TRUE)) / length(F)
+}
+
 # perform preprocessed (reduced) and raw (not reduced) leave-one-out prediction
 y.hat.3.4   <- loo.predict.gmlm(preprocess(X,  3,  4), y)
 y.hat.15.15 <- loo.predict.gmlm(preprocess(X, 15, 15), y)
 y.hat.20.30 <- loo.predict.gmlm(preprocess(X, 20, 30), y)
 y.hat       <- loo.predict.gmlm(X, y)
+y.hat.fft   <- loo.predict.gmlm(X, y, proj.betas = list(proj.fft, NULL))

 # classification performance measures table by leave-one-out cross-validation
-(loo.cv <- apply(cbind(y.hat.3.4, y.hat.15.15, y.hat.20.30, y.hat), 2, function(y.pred) {
-    sapply(c("acc", "err", "fpr", "tpr", "fnr", "tnr", "auc", "auc.sd"),
-        function(FUN) { match.fun(FUN)(as.integer(y) - 1L, y.pred) })
-}))
-#>         y.hat.3.4 y.hat.15.15 y.hat.20.30      y.hat
-#> acc    0.79508197  0.78688525  0.78688525 0.78688525
-#> err    0.20491803  0.21311475  0.21311475 0.21311475
-#> fpr    0.35555556  0.40000000  0.40000000 0.40000000
-#> tpr    0.88311688  0.89610390  0.89610390 0.89610390
-#> fnr    0.11688312  0.10389610  0.10389610 0.10389610
-#> tnr    0.64444444  0.60000000  0.60000000 0.60000000
-#> auc    0.85108225  0.83838384  0.83924964 0.83896104
-#> auc.sd 0.03584791  0.03760531  0.03751307 0.03754553
+(loo.cv <- apply(
+    cbind(y.hat.3.4, y.hat.15.15, y.hat.20.30, y.hat, y.hat.fft), 2,
+    function(y.pred) {
+        sapply(c("acc", "err", "fpr", "tpr", "fnr", "tnr", "auc", "auc.sd"),
+            function(FUN) { match.fun(FUN)(as.integer(y) - 1L, y.pred) })
+    }
+))
+#>         y.hat.3.4 y.hat.15.15 y.hat.20.30      y.hat  y.hat.fft
+#> acc    0.79508197  0.78688525  0.78688525 0.78688525 0.81147541
+#> err    0.20491803  0.21311475  0.21311475 0.21311475 0.18852459
+#> fpr    0.35555556  0.40000000  0.40000000 0.40000000 0.33333333
+#> tpr    0.88311688  0.89610390  0.89610390 0.89610390 0.89610390
+#> fnr    0.11688312  0.10389610  0.10389610 0.10389610 0.10389610
+#> tnr    0.64444444  0.60000000  0.60000000 0.60000000 0.66666667
+#> auc    0.85194805  0.83838384  0.83924964 0.83896104 0.84646465
+#> auc.sd 0.03574475  0.03760531  0.03751754 0.03754553 0.03751864


 ################################## Tensor SIR ##################################
--- a/dataAnalysis/eeg/03_eeg_3d.R
+++ b/dataAnalysis/eeg/03_eeg_3d.R
@ -1,6 +1,8 @@
 library(tensorPredictors)
-library(parallel)
-library(pROC)
+suppressPackageStartupMessages({
+    library(parallel)
+    library(pROC)
+})

 #' Mode-Wise PCA preprocessing (generalized (2D)^2 PCA)
 #'
@ -43,28 +45,6 @@ auc.sd <- function(y.true, y.pred) {
 }


-# # unified API for all reduction procedures
-# GMLM <- list(
-#     fit = function(X, y) tensorPredictors::gmlm_tensor_normal(X, as.integer(y), sample.axis = 4L),
-#     reduce = function(X, fit) mlm(X, fit$betas, 1:3, TRUE),
-#     applicable = function(X) TRUE
-# )
-# TSIR <- list(
-#     fit = function(X, y) tensorPredictors::TSIR(X, y, c(1L, 1L, 1L), sample.axis = 4L),
-#     reduce = function(X, fit) mlm(X, fit, 1:3, TRUE),
-#     applicable = function(X) TRUE
-# )
-# KPIR_LS <- list(
-#     fit = function(X, y) {
-#         if (any(dim(X)[-4] > dim(X)[4])) {
-#             stop("Dimensions too big")
-#         }
-#         tensorPredictors::kpir.ls(X, as.integer(y), sample.axis = 4L)
-#     },
-#     reduce = function(X, fit) if (is.null(fit)) NA else mlm(X, fit$alphas, 1:3, TRUE),
-#     applicable = function(X) all(dim(X)[1:3] <= dim(X)[4])
-# )
-
 #' Leave-one-out prediction using TSIR
 #'
 #' @param method reduction method to be applied
@ -98,6 +78,12 @@ loo.predict <- function(method, X, y, ...) {
    }, mc.cores = getOption("mc.cores", max(1L, parallel::detectCores() - 1L))))
 }

+# "Projects" a sequence to its first `nr.freq` frequency components
+proj.fft <- function(sequence, nr.freq = 5L) {
+    F <- fft(sequence)
+    Re(fft(`[<-`(F, head(order(abs(F)), -nr.freq), 0+0i), inverse = TRUE)) / length(F)
+}
+

 # Load full EEG dataset (3D tensor for each subject)
 c(X, y) %<-% readRDS("eeg_data_3d.rds")
@ -110,21 +96,22 @@ y.hat.3.4   <- loo.predict(gmlm_tensor_normal, preprocess(X,  3,  4, 3), y)
 y.hat.15.15 <- loo.predict(gmlm_tensor_normal, preprocess(X, 15, 15, 3), y)
 y.hat.20.30 <- loo.predict(gmlm_tensor_normal, preprocess(X, 20, 30, 3), y)
 y.hat       <- loo.predict(gmlm_tensor_normal, X, y)
+y.hat.fft   <- loo.predict(gmlm_tensor_normal, X, y, proj.betas = list(proj.fft, NULL, NULL))

 # classification performance measures table by leave-one-out cross-validation
-(loo.cv <- apply(cbind(y.hat.3.4, y.hat.15.15, y.hat.20.30, y.hat), 2, function(y.pred) {
+(loo.cv <- apply(cbind(y.hat.3.4, y.hat.15.15, y.hat.20.30, y.hat, y.hat.fft), 2, function(y.pred) {
    sapply(c("acc", "err", "fpr", "tpr", "fnr", "tnr", "auc", "auc.sd"),
        function(FUN) { match.fun(FUN)(as.integer(y) - 1L, y.pred) })
 }))
-#>         y.hat.3.4 y.hat.15.15 y.hat.20.30      y.hat
-#> acc    0.83606557  0.80327869  0.80327869 0.79508197
-#> err    0.16393443  0.19672131  0.19672131 0.20491803
-#> fpr    0.31111111  0.33333333  0.33333333 0.35555556
-#> tpr    0.92207792  0.88311688  0.88311688 0.88311688
-#> fnr    0.07792208  0.11688312  0.11688312 0.11688312
-#> tnr    0.68888889  0.66666667  0.66666667 0.64444444
-#> auc    0.88051948  0.86984127  0.86926407 0.86810967
-#> auc.sd 0.03118211  0.03254642  0.03259186 0.03295883
+#>         y.hat.3.4 y.hat.15.15 y.hat.20.30      y.hat  y.hat.fft
+#> acc    0.83606557  0.80327869  0.80327869 0.79508197 0.79508197
+#> err    0.16393443  0.19672131  0.19672131 0.20491803 0.20491803
+#> fpr    0.31111111  0.33333333  0.33333333 0.35555556 0.33333333
+#> tpr    0.92207792  0.88311688  0.88311688 0.88311688 0.87012987
+#> fnr    0.07792208  0.11688312  0.11688312 0.11688312 0.12987013
+#> tnr    0.68888889  0.66666667  0.66666667 0.64444444 0.66666667
+#> auc    0.88051948  0.86984127  0.86926407 0.86810967 0.86810967
+#> auc.sd 0.03118211  0.03254642  0.03259186 0.03295883 0.03354029


 ################################## Tensor SIR ##################################
--- a/sim/sim_efficiency.R
+++ b/sim/sim_efficiency.R
@ -0,0 +1,182 @@
+library(tensorPredictors)
+
+
+# Set PRNG seed to the first 4 digits of the golden ratio for reproducability
+set.seed(1618L, "Mersenne-Twister", "Inversion", "Rejection")
+
+### Simulation configuration
+reps <- 100                     # number of simulation replications
+sample.sizes <- c(100, 200, 300, 500, 750)  # sample sizes `n`
+
+# Parameterize the "true" reductions on the dimension
+gen.beta <- function(pj) {
+    as.matrix((-1)^seq_len(pj))
+}
+# the precision matrices are simply diag(pj)
+
+
+# sampling from the conditional matrix normal `X | Y = y ~ N(mu(y), I_{p1 p2})`
+sample.data <- function(sample.size, betas, Omegas, eta1 = 0) {
+    # responce is a standard normal variable
+    y <- rnorm(sample.size)
+    # F(y) is identical to y, except its a tensor (last axis is sample axis)
+    F <- array(y, dim = c(mapply(ncol, betas), sample.size))
+
+    # sample predictors from tensor normal X | Y = y (last axis is sample axis)
+    sample.axis <- length(betas) + 1L
+    Deltas <- Map(solve, Omegas)                            # normal covariances
+    mu_y <- mlm(mlm(F, betas) + as.vector(eta1), Deltas)    # conditional mean
+    X <- mu_y + rtensornorm(sample.size, 0, Deltas, sample.axis)           # responses X
+
+    list(X = X, F = F, y = y, sample.axis = sample.axis)
+}
+
+
+# Open simulation CSV log file
+log.name <- format(Sys.time(), "sim_efficiency-%Y%m%dT%H%M.csv")
+log.file <- file(log.name, "w")
+# Counts new number of writes purely here to write the CSV header the first time
+log.writes <- 0L
+
+# Setting p1 = p2 = pj  (note, in the paper `p = p1 p2`)
+mode.dims <- round(1.2^unique(round(logb(2:200, 1.2))))
+for (pj in mode.dims) {
+
+    betas.true <- list(gen.beta(pj), gen.beta(pj))
+    B.true <- kronecker(betas.true[[2]], betas.true[[1]])
+    Omegas.true <- list(diag(pj), diag(pj))
+
+    for (sample.size in sample.sizes) {
+
+        sim <- sapply(seq_len(reps), function(.) {
+            c(X, F, y, sample.axis) %<-% sample.data(sample.size, betas.true, Omegas.true)
+
+            ds.lm <- tryCatch({
+                unname(lm.fit(t(`dim<-`(X, c(pj^2, sample.size))), drop(F))$coefficients)
+                dist.subspace(B.true, B.lm, normalize = TRUE)
+            }, error = function(.) NA)
+
+            c(., betas.vec, Omegas.vec) %<-% gmlm_tensor_normal(`dim<-`(X, c(pj^2, sample.size)), drop(F))
+
+            c(., betas.gmlm, Omegas.gmlm) %<-% gmlm_tensor_normal(X, F)
+
+            c(., betas.mani, Omegas.mani) %<-% gmlm_tensor_normal(X, F,
+                proj.Omegas = rep(list(function(O) { diag(mean(diag(O)), nrow(O)) }), 2)
+            )
+
+            
+            ds.vec  <- dist.subspace(B.true, betas.vec[[1]], normalize = TRUE)
+            ds.gmlm <- dist.subspace(betas.true, betas.gmlm, normalize = TRUE)   # equiv to R> dist.subspace(B.true, B.gmlm)
+            ds.mani <- dist.subspace(betas.true, betas.mani, normalize = TRUE)
+
+            c(lm = ds.lm, vec = ds.vec, gmlm = ds.gmlm, mani = ds.mani)
+        })
+
+        sim <- as.data.frame(t(sim))
+        sim$sample.size <- sample.size
+        sim$pj <- pj
+
+        # boxplot(t(sim))
+        # summary(t(sim))
+
+        # Append current simulation results to log-file
+        write.table(sim, file = log.file, sep = ",",
+            row.names = FALSE, col.names = (log.writes <- log.writes + 1L) < 2L
+        )
+
+        # print progress
+        cat(sprintf("mode dim (%d): %d/%d - sample size (%d): %d/%d\n",
+            pj, which(pj == mode.dims), length(mode.dims),
+            sample.size, which(sample.size == sample.sizes), length(sample.sizes)
+        ))
+    }
+}
+close(log.file)
+
+
+# Read simulation data back in
+sim <- read.csv(log.name)
+
+# with(aggregate(sim, . ~ sample.size + pj, mean), {
+#     plot(range(pj), range(c(vec, gmlm, mani)), type = "n",
+#         main = "Simulation -- Efficiency Gain",
+#         xlab = expression(tilde(p)),
+#         ylab = expression(d(B, hat(B)))
+#     )
+#     for (sz in sort(unique(sample.size))) {
+#         i <- order(pj)[sample.size == sz]
+#         lines(pj[i], vec[i], type = "b", pch = 16, col = sz %/% 100, lty = 1)
+#         lines(pj[i], gmlm[i], type = "b", pch = 16, col = sz %/% 100, lty = 2)
+#         lines(pj[i], mani[i], type = "b", pch = 16, col = sz %/% 100, lty = 3)
+#     }
+#     sd <- aggregate(sim, . ~ sample.size + pj, sd)
+# })
+
+with(merge(
+    aggregate(sim[names(sim) != "lm"], . ~ sample.size + pj, mean),
+    aggregate(sim[names(sim) != "lm"], . ~ sample.size + pj, sd),
+    by = c("sample.size", "pj"),
+    suffixes = c("", ".sd"),
+    all = FALSE
+), {
+    plot(range(pj), range(c(vec, gmlm, mani)), type = "n",
+        main = "Simulation -- Efficiency Gain",
+        xlab = expression(tilde(p)),
+        ylab = expression(d(B, hat(B)))
+    )
+    # colors <- c("#cf7d03ff", "#002d8d", "#006e18")
+    # col.idx <- 0L
+    lty.idx <- 0L
+    for (sz in sort(unique(sample.size))) {
+        i <- order(pj)[(sample.size == sz)[order(pj)]]
+        # polygon(c(pj[i], rev(pj[i])), c(lm[i] + lm.sd[i], rev(lm[i] - lm.sd[i])),
+        #     col = paste0("#cf7d03", "50"), border = NA
+        # )
+        polygon(c(pj[i], rev(pj[i])), c(vec[i] + vec.sd[i], rev(vec[i] - vec.sd[i])),
+            col = paste0("#b30303", "50"), border = NA
+        )
+        polygon(c(pj[i], rev(pj[i])), c(gmlm[i] + gmlm.sd[i], rev(gmlm[i] - gmlm.sd[i])),
+            col = paste0("#002d8d", "50"), border = NA
+        )
+        polygon(c(pj[i], rev(pj[i])), c(mani[i] + mani.sd[i], rev(mani[i] - mani.sd[i])),
+            col = paste0("#006e18", "50"), border = NA
+        )
+    }
+    lty.idx <- 1L
+    for (sz in sort(unique(sample.size))) {
+        i <- order(pj)[(sample.size == sz)[order(pj)]]
+        # lines(pj[i], lm[i],   type = "b", pch = 16, col = "#cf7d03", lty = lty.idx, lwd = 2)
+        lines(pj[i], vec[i],  type = "b", pch = 16, col = "#b30303", lty = lty.idx, lwd = 2)
+        lines(pj[i], gmlm[i], type = "b", pch = 16, col = "#002d8d", lty = lty.idx, lwd = 2)
+        lines(pj[i], mani[i], type = "b", pch = 16, col = "#006e18", lty = lty.idx, lwd = 2)
+        lty.idx <- lty.idx + 1L
+    }
+})
+
+
+
+# unname(lm.fit(t(`dim<-`(X, c(pj^2, sample.size))), drop(F))$coefficients)
+# unname(lm(drop(F) ~ t(`dim<-`(X, c(pj^2, sample.size))) - 1)$coefficients)
+
+
+
+
+# require(utils)
+
+# set.seed(129)
+
+# n <- 7 ; p <- 2
+# X <- matrix(rnorm(n * p), n, p) # no intercept!
+# y <- rnorm(n)
+# w <- rnorm(n)^2
+
+# str(lmw <- lm.wfit(x = X, y = y, w = w))
+
+# str(lm. <- lm.fit (x = X, y = y))
+
+
+# if(require("microbenchmark")) {
+#     mb <- microbenchmark(lm(y~X), lm.fit(X,y), .lm.fit(X,y))
+#     print(mb)
+#     boxplot(mb, notch=TRUE)
+# }
--- a/tensorPredictors/R/gmlm_tensor_normal.R
+++ b/tensorPredictors/R/gmlm_tensor_normal.R
@ -114,9 +114,9 @@ gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)),
                    Sigmas[[j]] <- Sigmas[[j]] + diag(0.2 * min_max[2], nrow(Sigmas[[j]]))
                }
            }
-            # Compute (unconstraint but regularized) Omega_j as covariance inverse
+            # Compute (unconstraint) Omega_j's as covariance inverse
            Omegas[[j]] <- solve(Sigmas[[j]])
-            # Project Omega_j to the Omega_j's manifold
+            # Project Omega_j's to their manifolds
            if (is.function(proj_j <- proj.Omegas[[j]])) {
                Omegas[[j]] <- proj_j(Omegas[[j]])
                # Reverse computation of `Sigma_j` as inverse of `Omega_j`
--- a/tensorPredictors/R/kpir_ls.R
+++ b/tensorPredictors/R/kpir_ls.R
@ -44,14 +44,14 @@ kpir.ls <- function(X, Fy, max.iter = 20L, sample.axis = 1L,
    }, modes, dim(Fy)[modes])


-   ### Step 2: iterate per mode (axis) least squares estimates
+    ### Step 2: iterate per mode (axis) least squares estimates
    for (iter in seq_len(max.iter)) {
- 
+
        # Invoke logger for previous iterate
        if (is.function(logger)) {
            logger("ls", iter - 1L, alphas)
        }
- 
+
        # cyclic iterate over modes
        for (j in seq_along(modes)) {
            # least squares solution for `alpha_j | alpha_i, i != j`