tensor_predictors/simulations/eeg_sim.R

library(tensorPredictors)
suppressPackageStartupMessages({
    library(ggplot2)
})

################################################################################
###                             Loading EEG Data                             ###
################################################################################

# Load as 3D predictors `X` and flat response `y`
c(X, y) %<-% local({
    # Load from file
    ds <- readRDS("eeg_data.rds")

    # Dimension values
    n <- nrow(ds)       # sample size (nr. of people)
    p <- 64L            # nr. of predictors (count of sensorce)
    t <- 256L           # nr. of time points (measurements)

    # Extract dimension names
    nNames <- ds$PersonID
    tNames <- as.character(seq(t))
    pNames <- unlist(strsplit(colnames(ds)[2 + t * seq(p)], "_"))[c(TRUE, FALSE)]

    # Split into predictors (with proper dims and names) and response
    X <- array(as.matrix(ds[, -(1:2)]),
        dim = c(person = n, time = t, sensor = p),
        dimnames = list(person = nNames, time = tNames, sensor = pNames)
    )
    y <- ds$Case_Control

    list(X, y)
})

################################################################################
###                       LOO-CV for Multiple Methods                        ###
################################################################################

# compatibility wrapper for function implemented with the "old" API
toNewAPI <- function(func) {
    function(...) {
        res <- func(...)
        list(alphas = list(res$beta, res$alpha))
    }
}

# Number of (2D)^2 PCA components per axis
npcs <- list(c(3, 4), c(15, 15), c(20, 30), dim(X)[-1])

# setup methods for simulation (with unified API)
methods <- list(
    hopca             = list(
        fun = function(X, Fy) list(alphas = hopca(X, npc = c(1L, 1L), 1L)),
        is.applicable = function(npc) all(npc == c(256L, 64L))  # NOT reduced
    ),
    kpir.base         = list(
        fun = toNewAPI(kpir.base),
        is.applicable = function(npc) prod(npc) < 100
    ),
    kpir.new.vlp      = list(
        fun = toNewAPI(function(X, Fy) kpir.new(X, Fy, init.method = "vlp")),
        is.applicable = function(npc) prod(npc) < 100
    ),
    kpir.new.ls       = list(
        fun = toNewAPI(function(X, Fy) kpir.new(X, Fy, init.method = "ls")),
        is.applicable = function(npc) prod(npc) < 100
    ),
    kpir.ls           = list(
        fun = kpir.ls,
        is.applicable = function(npc) TRUE
    ),
    LSIR              = list(
        fun = function(X, Fy) {
            res <- LSIR(matrix(X, nrow(X)), Fy, dim(X)[2], dim(X)[3])
            list(alphas = list(res$beta, res$alpha))
        },
        is.applicable = function(npc) prod(npc) < 1000
    ),
    kpir.momentum.vlp = list(
        fun = toNewAPI(function(X, Fy) kpir.momentum(X, Fy, init.method = "vlp")),
        is.applicable = function(npc) prod(npc) < 100
    ),
    kpir.momentum.ls  = list(
        fun = toNewAPI(function(X, Fy) kpir.momentum(X, Fy, init.method = "ls")),
        is.applicable = function(npc) prod(npc) < 100
    ),
    kpir.approx.vlp   = list(
        fun = toNewAPI(function(X, Fy) kpir.approx(X, Fy, init.method = "vlp")),
        is.applicable = function(npc) prod(npc) < 100
    ),
    kpir.approx.ls    = list(
        fun = toNewAPI(function(X, Fy) kpir.approx(X, Fy, init.method = "ls")),
        is.applicable = function(npc) TRUE
    )
)

# define AUC for reporting while simulation is running
auc <- function(y_true, y_pred) pROC::roc(y_true, y_pred, quiet = TRUE)$auc[1]

# file to dump simulation results
log.file <- format(Sys.time(), "eeg_sim_%Y%m%dT%H%M.rds")

# init complete simulation as empty
sim <- NULL
for (npc in npcs) {
    # check if any PC count is smaller than the axis
    if (any(npc < dim(X)[-1])) {
        # Reduce dimensions using (2D)^2 PCA, which is a special case of the Higher
        # Order Principal Component Analysis
        pcs <- hopca(X, npc = npc, sample.axis = 1)
        # Reduce dimensions
        X.pc <- mlm(X, Map(t, pcs), modes = 2:3)
    } else {
        # No reduction
        X.pc <- X
    }

    for (name in names(methods)) {
        # check if method can be applied to current reduction dimensions
        if (!methods[[name]]$is.applicable(npc)) {
            next
        }

        # extract method to be applied
        method <- methods[[name]]$fun

        # report name of current simulation method
        cat(sprintf("npc: (t = %d, p = %d), method: %s\n", npc[1], npc[2], name))

        # Leave-One-Out Cross-Validation
        loo.cv <- data.frame(
            y_true = y, y_pred = NA,                    # CV responses
            elapsed = NA, sys.self = NA, user.self = NA # execution time
        )
        for (i in seq_len(nrow(X.pc))) {
            # report progress
            cat(sprintf("\r%3d/%d", i, nrow(X.pc)))

            # Split into training/test data
            X.train <- X.pc[-i, , ]
            y.train <- scale(y[-i], scale = FALSE)
            X.test <- X.pc[i, , , drop = FALSE]
            y.test <- scale(y[i], center = attr(y.train, "scaled:center"), scale = FALSE)

            # fit reduction (with method one of the methods to be "validated")
            time <- system.time(sdr <- method(X.train, c(y.train)))

            # reduce training data and fit a GLM
            if ("Deltas" %in% names(sdr)) {
                # the small deltas are `delta_j = Delta_j^-1 alpha_j`
                deltas <- Map(solve, sdr$Deltas, sdr$alphas)
            } else {
                deltas <- sdr$alphas
            }
            x.train <- mlm(X.train, Map(t, deltas), modes = 2:3)
            fit <- glm(y ~ x, family = binomial(link = "logit"),
                data = data.frame(y = y[-i], x = matrix(x.train, nrow(x.train))))

            # predict from reduced test data
            x.test <- mlm(X.test, Map(t, deltas), modes = 2:3)
            y.pred <- predict(fit, data.frame(x = matrix(x.test, 1)), type = "response")

            loo.cv[i, "y_pred"] <- y.pred
            loo.cv[i, "elapsed"] <- time["elapsed"]
            loo.cv[i, "sys.self"] <- time["sys.self"]
            loo.cv[i, "user.self"] <- time["user.self"]
        }

        # accumulate LOO-CV results to previous results
        loo.cv$method <- factor(name)
        loo.cv$npc <- factor(sprintf("(%d, %d)", npc[1], npc[2]))
        sim <- rbind(sim, loo.cv)

        # Report partial sim done and one of the interesting measures
        cat(sprintf(" (Done) AUC: %f\n", with(loo.cv, auc(y_true, y_pred))))

        # dump simulation (after each fold) to file
        saveRDS(sim, log.file)
    }
}

################################################################################
###                             Simulation Stats                             ###
################################################################################
# sim <- readRDS("eeg_sim_<date-time>.rds")
# sim <- readRDS("eeg_sim_20220524T2100.rds")
# sim <- readRDS("eeg_sim_20220525T1700.rds")

metrics <- list(
    # acc: Accuracy. P(Yhat = Y). Estimated as: (TP+TN)/(P+N).
    "Acc" = function(y_true, y_pred) mean(round(y_pred) == y_true),
    # err: Error rate. P(Yhat != Y). Estimated as: (FP+FN)/(P+N).
    "Err" = function(y_true, y_pred) mean(round(y_pred) != y_true),
    # fpr: False positive rate. P(Yhat = + | Y = -). aliases: Fallout.
    "FPR" = function(y_true, y_pred) mean((round(y_pred) == 1)[y_true == 0]),
    # tpr: True positive rate.  P(Yhat = + | Y = +). aliases: Sensitivity, Recall.
    "TPR" = function(y_true, y_pred) mean((round(y_pred) == 1)[y_true == 1]),
    # fnr: False negative rate. P(Yhat = - | Y = +). aliases: Miss.
    "FNR" = function(y_true, y_pred) mean((round(y_pred) == 0)[y_true == 1]),
    # tnr: True negative rate.  P(Yhat = - | Y = -).
    "TNR" = function(y_true, y_pred) mean((round(y_pred) == 0)[y_true == 0]),
    # auc: Area Under the Curve
    "AUC" = function(y_true, y_pred) pROC::roc(y_true, y_pred, quiet = TRUE)$auc[1],
    # auc.sd: Estimated standard error of the AUC estimate
    "sd(AUC)" = function(y_true, y_pred)
        sqrt(pROC::var(pROC::roc(y_true, y_pred, quiet = TRUE)))
)

# Applies metrics on a group
do.stats <- function(group) {
    stat <- Map(do.call, metrics, list(as.list(group[c("y_true", "y_pred")])))
    data.frame(method = group$method[1], npc = group$npc[1], stat, check.names = FALSE)
}
# Call stats for each grouping
stats <- do.call(rbind, Map(do.stats, split(sim, ~ method + npc, sep = " ", drop = TRUE)))
rownames(stats) <- NULL

print(stats, digits = 2)

# and execution time stats
times <- aggregate(cbind(elapsed, sys.self, user.self) ~ method + npc, sim, median)

print(times, digits = 2)

## stats: 2022.05.24
#               method       npc  Acc  Err  FPR  TPR  FNR  TNR  AUC sd(AUC)
# 1          kpir.base    (3, 4) 0.70 0.30 0.60 0.87 0.13 0.40 0.75   0.047
# 2       kpir.new.vlp    (3, 4) 0.70 0.30 0.60 0.87 0.13 0.40 0.75   0.047
# 3        kpir.new.ls    (3, 4) 0.74 0.26 0.51 0.88 0.12 0.49 0.77   0.045
# 4            kpir.ls    (3, 4) 0.75 0.25 0.49 0.88 0.12 0.51 0.78   0.044
#          (*) kpir.ls    (3, 4) 0.78 0.22 0.38 0.87 0.13 0.62 0.86   0.034
# 5  kpir.momentum.vlp    (3, 4) 0.70 0.30 0.60 0.87 0.13 0.40 0.75   0.047
# 6   kpir.momentum.ls    (3, 4) 0.70 0.30 0.58 0.87 0.13 0.42 0.76   0.046
# 7    kpir.approx.vlp    (3, 4) 0.68 0.32 0.62 0.86 0.14 0.38 0.74   0.048
# 8     kpir.approx.ls    (3, 4) 0.73 0.27 0.53 0.88 0.12 0.47 0.78   0.044
# 9            kpir.ls  (15, 15) 0.75 0.25 0.47 0.87 0.13 0.53 0.78   0.044
#          (*) kpir.ls  (15, 15) 0.76 0.24 0.44 0.88 0.12 0.56 0.83   0.039
# 10    kpir.approx.ls  (15, 15) 0.73 0.27 0.51 0.87 0.13 0.49 0.78   0.044
# 11           kpir.ls  (20, 30) 0.75 0.25 0.47 0.87 0.13 0.53 0.78   0.044
#          (*) kpir.ls  (20, 30) 0.77 0.23 0.36 0.84 0.16 0.64 0.79   0.045
# 12    kpir.approx.ls  (20, 30) 0.63 0.37 1.00 1.00 0.00 0.00 0.51   0.053
# 13           kpir.ls (256, 64) 0.75 0.25 0.44 0.87 0.13 0.56 0.78   0.044
#          (*) kpir.ls (256, 64) 0.68 0.32 0.51 0.79 0.21 0.49 0.66   0.054
# 14    kpir.approx.ls (256, 64) 0.75 0.25 0.44 0.87 0.13 0.56 0.78   0.044
#
# (*) Using reduction matrices `Map(solve, sdr$Deltas, sdr$alphas)` instead
#     of only `sdr$alpha`.

## times: 2022.05.24
#               method       npc elapsed sys.self user.self
# 1          kpir.base    (3, 4)   0.079    0.402     0.220
# 2       kpir.new.vlp    (3, 4)   0.075    0.393     0.217
# 3        kpir.new.ls    (3, 4)   0.218    0.243     0.305
# 4            kpir.ls    (3, 4)   0.003    0.006     0.006
# 5  kpir.momentum.vlp    (3, 4)   0.143    0.595     0.359
# 6   kpir.momentum.ls    (3, 4)   0.297    0.252     0.385
# 7    kpir.approx.vlp    (3, 4)   0.044    0.240     0.152
# 8     kpir.approx.ls    (3, 4)   0.066    0.144     0.121
# 9            kpir.ls  (15, 15)   0.012    0.059     0.034
# 10    kpir.approx.ls  (15, 15)   0.813    3.911     2.325
# 11           kpir.ls  (20, 30)   0.028    0.129     0.080
# 12    kpir.approx.ls  (20, 30)   2.110   10.111     6.290
# 13           kpir.ls (256, 64)   1.252    6.215     3.681
# 14    kpir.approx.ls (256, 64)  36.754  141.028   147.490