wip: 2d and 2d EEG

This commit is contained in:
Daniel Kapla 2025-04-01 16:56:57 +02:00
parent e910db0377
commit 13f85bdc16
4 changed files with 388 additions and 0 deletions

View File

@ -0,0 +1,148 @@
# List of all sensor names/positions (in chanel order) to ensure all entries
# are places at the same sensor x time position in the matrix layout
sensors <- c(
"FP1", "FP2", "F7", "F8", "AF1", "AF2", "FZ", "F4", "F3", "FC6", "FC5",
"FC2", "FC1", "T8", "T7", "CZ", "C3", "C4", "CP5", "CP6", "CP1", "CP2",
"P3", "P4", "PZ", "P8", "P7", "PO2", "PO1", "O2", "O1", "X", "AF7",
"AF8", "F5", "F6", "FT7", "FT8", "FPZ", "FC4", "FC3", "C6", "C5", "F2",
"F1", "TP8", "TP7", "AFZ", "CP3", "CP4", "P5", "P6", "C1", "C2", "PO7",
"PO8", "FCZ", "POZ", "OZ", "P2", "P1", "CPZ", "nd", "Y"
)
tmpdir <- tempdir()
untar("eeg_full.tar", exdir = tmpdir) # uncompress
subjects <- untar("eeg_full.tar", list = TRUE) # file names (only read)
subjects <- `names<-`(vector("list", length(subjects)), substr(subjects, 1, 11))
for (i in seq_along(subjects)) {
subject <- names(subjects)[i]
# Decompressed folder of trials for current subject
untar(file.path(tmpdir, sprintf("%s.tar.gz", subject)), exdir = tmpdir)
# Iterate all trial files of current subject
X <- lapply(list.files(file.path(tmpdir, subject), full.names = TRUE),
function(trial) {
# Read leading meta data lines and data from gz compressed CSV file
conn <- gzfile(trial)
meta <- readLines(conn, 4)
# If there are less than 4 entries in meta, the file does NOT contain
# any data -> error, ignore trial
if (length(meta) < 4) {
return(NULL)
}
# Read data (measurements)
data <- read.csv(conn,
header = FALSE, sep = " ", comment.char = "#",
col.names = c("trial", "sensor", "time", "volts"),
colClasses = c("integer", "character", "integer", "numeric")
)
# Compute index of every entry (ensures same
# placement of measurements in its matrix representation)
data$sensor <- factor(data$sensor, levels = sensors)
idx <- (as.integer(data$sensor) - 1L) * 256L + data$time + 1L
# Check if every sensor at every time point is present to ensure
# no interleaving or shifted data in final 3D array
if ((length(idx) != 64 * 256) || any(sort(idx) != seq_along(idx))) {
return(NULL)
}
# Return measurements in standardized order and attach meta info
structure(data$volts[idx], meta = meta[4])
}
)
# Count nr. of errors (return NULL) and remove from data
file_error_idx <- which(sapply(X, is.null))
if (length(file_error_idx)) {
X[file_error_idx] <- NULL
}
# Extract meta information
meta <- sapply(X, attr, "meta")
# Check for error notification in meta data and drop them as well
notice_error_idx <- grep(".*err.*", meta)
if (length(notice_error_idx)) {
X[notice_error_idx] <- NULL
meta <- meta[-notice_error_idx]
}
# Split into trial condition and trial nr.
condition <- factor(
sub(".*(S1 obj|S2 match|S2 nomatch).*", meta, replacement = "\\1"),
levels = c("S1 obj", "S2 match", "S2 nomatch")
)
# Concatinate individual trials in a 3D array (based on standardized order)
X <- matrix(unlist(X), 256 * 64)
# Track for reporting the nr. of non-finite values
nr_non_finite <- sum(!is.finite(X))
# Compute mean over trials grouped by trial condition
X <- c(
rowMeans(X[, condition == "S1 obj", drop = FALSE], na.rm = TRUE),
rowMeans(X[, condition == "S2 match", drop = FALSE], na.rm = TRUE),
rowMeans(X[, condition == "S2 nomatch", drop = FALSE], na.rm = TRUE)
)
# store mean of trials of current subject in list of subjects
subjects[[subject]] <- X
# Remove/Delete subject file and decompressed folder
unlink(file.path(tmpdir, subject), recursive = TRUE)
unlink(file.path(tmpdir, sprintf("%s.tar.gz", subject)))
# Report progress
cat(sprintf(
"%5d/%d - Nr. trials: %3d = %3d + %3d + %3d%s%s\n",
i, length(subjects),
length(condition),
sum(condition == "S1 obj"),
sum(condition == "S2 match"),
sum(condition == "S2 nomatch"),
if (nr_non_finite) sprintf(", Nr. non-finite: %d", nr_non_finite) else "",
if (length(file_error_idx) && length(notice_error_idx)) {
sprintf(", %d file and %d notice errors (%s) -> trials dropped",
length(file_error_idx), length(notice_error_idx), subject
)
} else if (length(file_error_idx)) {
sprintf(", %d file errors (%s) -> trials dropped",
length(file_error_idx), subject
)
} else if (length(notice_error_idx)) {
sprintf(", %d notice errors (%s) -> trials dropped",
length(notice_error_idx), subject
)
} else {
""
}
))
}
# Combine subjects in single 4D tensor (time x sensor x condition x subject)
X <- array(
unlist(subjects),
dim = c(time = 256, sensor = 64, condition = 3, subject = length(subjects)),
dimnames = list(
time = 1:256,
sensor = sensors,
condition = c("S1 obj", "S2 match", "S2 nomatch"),
subject = names(subjects)
)
)
# Extract alcoholic or control labels for each subject
y <- factor(
substr(names(subjects), 4, 4),
levels = c("c", "a"),
labels = c("control", "alcoholic")
)
names(y) <- names(subjects)
# Save full processed EEG dataset as R data file
saveRDS(list(X = X, y = y), file = "eeg_data_3d.rds")
saveRDS(list(X = X[, , "S1 obj", ], y = y), file = "eeg_data_2d.rds")

View File

@ -0,0 +1,149 @@
library(tensorPredictors)
library(parallel)
library(pROC)
#' Mode-Wise PCA preprocessing
#'
#' @param npc_time Number of Principal Components for time axis
#' @param npc_sensor Number of Principal Components for sensor axis
#' @param npc_condition Number of Principal Components for stimulus condition axis
preprocess <- function(X, npc_time, npc_sensor, npc_condition) {
# Mode covariances (for predictor and time point modes)
c(Sigma_t, Sigma_s, Sigma_c) %<-% mcov(X)
# "predictor" (sensor) and time point principal components
V_t <- svd(Sigma_t, npc_time, 0L)$u
V_s <- svd(Sigma_s, npc_sensor, 0L)$u
V_c <- svd(Sigma_c, npc_condition, 0L)$u
# reduce with mode wise PCs
mlm(X, list(V_t, V_s, V_c), modes = 1:3, transposed = TRUE)
}
### Classification performance measures
# acc: Accuracy. P(Yhat = Y). Estimated as: (TP+TN)/(P+N).
acc <- function(y.true, y.pred) mean(round(y.pred) == y.true)
# err: Error rate. P(Yhat != Y). Estimated as: (FP+FN)/(P+N).
err <- function(y.true, y.pred) mean(round(y.pred) != y.true)
# fpr: False positive rate. P(Yhat = + | Y = -). aliases: Fallout.
fpr <- function(y.true, y.pred) mean((round(y.pred) == 1)[y.true == 0])
# tpr: True positive rate. P(Yhat = + | Y = +). aliases: Sensitivity, Recall.
tpr <- function(y.true, y.pred) mean((round(y.pred) == 1)[y.true == 1])
# fnr: False negative rate. P(Yhat = - | Y = +). aliases: Miss.
fnr <- function(y.true, y.pred) mean((round(y.pred) == 0)[y.true == 1])
# tnr: True negative rate. P(Yhat = - | Y = -).
tnr <- function(y.true, y.pred) mean((round(y.pred) == 0)[y.true == 0])
# auc: Area Under the Curve
auc <- function(y.true, y.pred) as.numeric(pROC::roc(y.true, y.pred, quiet = TRUE)$auc)
auc.sd <- function(y.true, y.pred) sqrt(pROC::var(pROC::roc(y.true, y.pred, quiet = TRUE)))
# Load full EEG dataset of all subjects
c(X, y) %<-% readRDS("eeg_data_3d.rds")
##################################### GMLM #####################################
#' Leave-one-out prediction using GMLM
#'
#' @param X 3D EEG data (preprocessed or not)
#' @param y binary responce `y` as a 3D tensor, every obs. is a 1 x 1 matrix
loo.predict.gmlm <- function(X, y) {
unlist(parallel::mclapply(seq_along(y), function(i) {
# Fit with i'th observation removed
fit <- gmlm_tensor_normal(X[ , , , -i], as.integer(y[-i]), sample.axis = 4L)
# Reduce the entire data set
r <- as.vector(mlm(X, fit$betas, modes = 1:3, transpose = TRUE))
# Fit a logit model on reduced data with i'th observation removed
logit <- glm(y ~ r, family = binomial(link = "logit"),
data = data.frame(y = y[-i], r = r[-i])
)
# predict i'th response given i'th reduced observation
y.hat <- predict(logit, newdata = data.frame(r = r[i]), type = "response")
# report progress
cat(sprintf("dim: (%d, %d, %d) - %3d/%d\n",
dim(X)[1], dim(X)[2], dim(X)[3], i, length(y)
))
y.hat
}, mc.cores = getOption("mc.cores", max(1L, parallel::detectCores() - 1L))))
}
# perform preprocessed (reduced) and raw (not reduced) leave-one-out prediction
y.hat.3.4 <- loo.predict.gmlm(preprocess(X, 3, 4, 3), y)
y.hat.15.15 <- loo.predict.gmlm(preprocess(X, 15, 15, 3), y)
y.hat.20.30 <- loo.predict.gmlm(preprocess(X, 20, 30, 3), y)
y.hat <- loo.predict.gmlm(X, y)
# classification performance measures table by leave-one-out cross-validation
(loo.cv <- apply(cbind(y.hat.3.4, y.hat.15.15, y.hat.20.30, y.hat), 2, function(y.pred) {
sapply(c("acc", "err", "fpr", "tpr", "fnr", "tnr", "auc", "auc.sd"),
function(FUN) { match.fun(FUN)(as.integer(y) - 1L, y.pred) })
}))
#> y.hat.3.4 y.hat.15.15 y.hat.20.30 y.hat
#> acc 0.83606557 0.80327869 0.80327869 0.80327869
#> err 0.16393443 0.19672131 0.19672131 0.19672131
#> fpr 0.31111111 0.33333333 0.33333333 0.33333333
#> tpr 0.92207792 0.88311688 0.88311688 0.88311688
#> fnr 0.07792208 0.11688312 0.11688312 0.11688312
#> tnr 0.68888889 0.66666667 0.66666667 0.66666667
#> auc 0.88023088 0.87070707 0.87041847 0.86810967
#> auc.sd 0.03124875 0.03244623 0.03248653 0.03295883
################################## Tensor SIR ##################################
#' Leave-one-out prediction using TSIR
#'
#' @param X 3D EEG data (preprocessed or not)
#' @param y binary responce vector
loo.predict.tsir <- function(X, y) {
unlist(parallel::mclapply(seq_along(y), function(i) {
# Fit with i'th observation removed
fit <- TSIR(X[ , , , -i], y[-i], c(1L, 1L, 1L), sample.axis = 4L)
# Reduce the entire data set
r <- as.vector(mlm(X, fit, modes = 1:3, transpose = TRUE))
# Fit a logit model on reduced data with i'th observation removed
logit <- glm(y ~ r, family = binomial(link = "logit"),
data = data.frame(y = y[-i], r = r[-i])
)
# predict i'th response given i'th reduced observation
y.hat <- predict(logit, newdata = data.frame(r = r[i]), type = "response")
# report progress
cat(sprintf("dim: (%d, %d, %d) - %3d/%d\n",
dim(X)[1], dim(X)[2], dim(X)[3], i, length(y)
))
y.hat
}, mc.cores = getOption("mc.cores", max(1L, parallel::detectCores() - 1L))))
}
# perform preprocessed (reduced) and raw (not reduced) leave-one-out prediction
y.hat.3.4 <- loo.predict.tsir(preprocess(X, 3, 4, 3), y)
y.hat.15.15 <- loo.predict.tsir(preprocess(X, 15, 15, 3), y)
y.hat.20.30 <- loo.predict.tsir(preprocess(X, 20, 30, 3), y)
y.hat <- loo.predict.tsir(X, y)
# classification performance measures table by leave-one-out cross-validation
(loo.cv <- apply(cbind(y.hat.3.4, y.hat.15.15, y.hat.20.30, y.hat), 2, function(y.pred) {
sapply(c("acc", "err", "fpr", "tpr", "fnr", "tnr", "auc", "auc.sd"),
function(FUN) { match.fun(FUN)(as.integer(y) - 1L, y.pred) })
}))
#> y.hat.3.4 y.hat.15.15 y.hat.20.30 y.hat
#> acc 0.81967213 0.84426230 0.81147541 0.80327869
#> err 0.18032787 0.15573770 0.18852459 0.19672131
#> fpr 0.33333333 0.24444444 0.33333333 0.33333333
#> tpr 0.90909091 0.89610390 0.89610390 0.88311688
#> fnr 0.09090909 0.10389610 0.10389610 0.11688312
#> tnr 0.66666667 0.75555556 0.66666667 0.66666667
#> auc 0.86522367 0.89379509 0.88196248 0.85974026
#> auc.sd 0.03357539 0.03055047 0.02986038 0.03367847

View File

@ -0,0 +1,91 @@
# EEG Database
## Data Type
Multiple electrode time series EEG recordings of control and alcoholic subjects.
## Abstract
This data arises from a large study to examine EEG correlates of genetic predisposition to alcoholism. It contains measurements from 64 electrodes placed on the scalp sampled at 256 Hz (3.9-msec epoch) for 1 second.
## Sources
See: https://kdd.ics.uci.edu/databases/eeg/eeg.data.html
### Original Owner
Henri Begleiter <br/>
Neurodynamics Laboratory, <br/>
State University of New York Health Center <br/>
Brooklyn, New York
### Donor
Lester Ingber <br/>
POB 06440 Sears Tower <br/>
Chicago, IL 60606 <br/>
ingber@ingber.com
__Date Donated__: October 13, 1999
## Data Characteristics
This data arises from a large study to examine EEG correlates of genetic predisposition to alcoholism. It contains measurements from 64 electrodes placed on subject's scalps which were sampled at 256 Hz (3.9-msec epoch) for 1 second.
There were two groups of subjects: alcoholic and control. Each subject was exposed to either a single stimulus (S1) or to two stimuli (S1 and S2) which were pictures of objects chosen from the 1980 Snodgrass and Vanderwart picture set. When two stimuli were shown, they were presented in either a matched condition where S1 was identical to S2 or in a non-matched condition where S1 differed from S2.
Shown here are example plots of a control and alcoholic subject. The plots indicate voltage, time, and channel and are averaged over 10 trials for the single stimulus condition.
There were 122 subjects and each subject completed 120 trials where different stimuli were shown. The electrode positions were located at standard sites (Standard Electrode Position Nomenclature, American Electroencephalographic Association 1990). Zhang et al. (1995) describes in detail the data collection process.
## Data Format
There are three versions of the EEG data set.
### The Small Data Set
The small data set (smni97_eeg_data.tar.gz) contains data for the 2 subjects, alcoholic a_co2a0000364 and control c_co2c0000337. For each of the 3 matching paradigms, c_1 (one presentation only), c_m (match to previous presentation) and c_n (no-match to previous presentation), 10 runs are shown.
### The Large Data Set
The large data set (SMNI_CMI_TRAIN.tar.gz and SMNI_CMI_TEST.tar.gz) contains data for 10 alcoholic and 10 control subjects, with 10 runs per subject per paradigm. The test data used the same 10 alcoholic and 10 control subjects as with the training data, but with 10 out-of-sample runs per subject per paradigm.
### The Full Data Set
This data set contains all 120 trials for 122 subjects. The entire set of data is about 700 MBytes.
NOTE: There are 17 trials with empty files in co2c1000367. Some trials have `"err"` notices, e.g., search/grep for `"err"` and see `"S2 match err"` or `"S2 nomatch err"` etc.
### File Format
Each trial is stored in its own file and will appear in the following format.
```R
# co2a0000364.rd
# 120 trials, 64 chans, 416 samples 368 post_stim samples
# 3.906000 msecs uV
# S1 obj , trial 0
# FP1 chan 0
0 FP1 0 -8.921
0 FP1 1 -8.433
0 FP1 2 -2.574
0 FP1 3 5.239
0 FP1 4 11.587
0 FP1 5 14.028
...
```
The first four lines are header information. Line 1 contains the subject identifier and indicates if the subject was an alcholic (a) or control (c) subject by the fourth letter. Line 4 identifies the matching conditions: a single object shown (S1 obj), object 2 shown in a matching condition (S2 match), and object 2 shown in a non matching condition (S2 nomatch).
Line 5 identifies the start of the data from sensor FP1. The four columns of data are: the trial number, sensor position, sample number (0-255), and sensor value (in micro volts).
## Past Usage
X.L. Zhang, H. Begleiter, B. Porjesz, W. Wang, and A. Litke. (1995). "Event related potentials during object recognition tasks". _Brain Research Bulletin_. Volume 38. Number 6. Pages 531-538.
## Acknowledgements, Copyright Information, and Availability
There are no usage restrictions on this data.
Acknowledgments for this data should made to Henri Begleiter at the Neurodynamics Laboratory at the State University of New York Health Center at Brooklyn.
Plots are courtesy of Roger Gabriel.
## References and Further Information
- L. Ingber. (1997). "Statistical mechanics of neocortical interactions: Canonical momenta indicators of electroencephalography." _Physical Review E_. Volume 55. Number 4. Pages 4578-4593.
- L. Ingber. (1998). "Statistical mechanics of neocortical interactions: Training and testing canonical momenta indicators of EEG." _Mathematical Computer Modelling_. Volume 27. Number 3. Pages 33-64.
- J. G. Snodgrss and M. Vanderwart. (1980). "A standardized set of 260 pictures: norms for the naming agreement, familiarity, and visual complexity." _Journal of Experimental Psychology: Human Learning and Memory_. Volume 6. Pages 174-215.
The UCI KDD Archive <br/>
Information and Computer Science <br/>
University of California, Irvine <br/>
Irvine, CA 92697-3425 <br/>
Last modified: October 27, 1999.