# Source file: tensor_predictors/dataAnalysis/eeg/01_eeg_preprocess.R
# (149 lines, 5.6 KiB, R)

# Names of all EEG sensors/positions (in channel order). Fixing this order
# ensures every measurement is placed at the same sensor x time position in
# the matrix layout, regardless of row order inside a trial file.
sensors <- c(
  "FP1", "FP2", "F7", "F8", "AF1", "AF2", "FZ", "F4", "F3", "FC6", "FC5",
  "FC2", "FC1", "T8", "T7", "CZ", "C3", "C4", "CP5", "CP6", "CP1", "CP2",
  "P3", "P4", "PZ", "P8", "P7", "PO2", "PO1", "O2", "O1", "X", "AF7",
  "AF8", "F5", "F6", "FT7", "FT8", "FPZ", "FC4", "FC3", "C6", "C5", "F2",
  "F1", "TP8", "TP7", "AFZ", "CP3", "CP4", "P5", "P6", "C1", "C2", "PO7",
  "PO8", "FCZ", "POZ", "OZ", "P2", "P1", "CPZ", "nd", "Y"
)
tmpdir <- tempdir()
# Decompress the outer archive; every entry is one subject's gzipped tarball.
untar("eeg_full.tar", exdir = tmpdir)
# List the archive entries without extracting (names only) and build an
# empty per-subject list keyed by the 11-character subject ID prefix.
archive_entries <- untar("eeg_full.tar", list = TRUE)
subjects <- setNames(
  vector("list", length(archive_entries)),
  substr(archive_entries, 1, 11)
)
# Process each subject in turn: extract the subject's tarball, read and
# validate every trial file, drop unusable trials, and reduce the rest to
# one mean trial per experimental condition.
for (i in seq_along(subjects)) {
subject <- names(subjects)[i]
# Decompressed folder of trials for current subject
untar(file.path(tmpdir, sprintf("%s.tar.gz", subject)), exdir = tmpdir)
# Iterate all trial files of current subject. Each resulting element is
# either a numeric vector of length 64 * 256 (sensor-major, time fastest)
# with the 4th meta line attached as attribute, or NULL for a bad file.
X <- lapply(list.files(file.path(tmpdir, subject), full.names = TRUE),
function(trial) {
# Read leading meta data lines and data from gz compressed CSV file
conn <- gzfile(trial)
meta <- readLines(conn, 4)
# If there are less than 4 entries in meta, the file does NOT contain
# any data -> error, ignore trial
if (length(meta) < 4) {
return(NULL)
}
# Read data (measurements). `readLines` opened and then closed (but did
# not destroy) the connection, so `read.csv` re-opens it from the start;
# the leading meta lines are skipped again via `comment.char = "#"`.
data <- read.csv(conn,
header = FALSE, sep = " ", comment.char = "#",
col.names = c("trial", "sensor", "time", "volts"),
colClasses = c("integer", "character", "integer", "numeric")
)
# Compute the linear index of every entry to ensure identical placement
# of measurements in the matrix representation:
#   idx = (sensor - 1) * 256 + time + 1, with time expected in 0..255.
data$sensor <- factor(data$sensor, levels = sensors)
idx <- (as.integer(data$sensor) - 1L) * 256L + data$time + 1L
# Check that every sensor at every time point is present exactly once
# (idx must be a permutation of 1:(64 * 256)) to guarantee no
# interleaved or shifted data in the final 3D array.
if ((length(idx) != 64 * 256) || any(sort(idx) != seq_along(idx))) {
return(NULL)
}
# Return measurements in standardized order and attach meta info
# (the 4th header line, which carries the condition / error notice)
structure(data$volts[idx], meta = meta[4])
}
)
# Count nr. of file errors (trials that returned NULL) and remove them
# from the list (assigning NULL deletes list elements).
file_error_idx <- which(sapply(X, is.null))
if (length(file_error_idx)) {
X[file_error_idx] <- NULL
}
# Extract meta information (4th header line) of every remaining trial
meta <- sapply(X, attr, "meta")
# Check for error notification in meta data and drop those trials as well;
# `notice_error_idx` is relative to the already-reduced `X`.
notice_error_idx <- grep(".*err.*", meta)
if (length(notice_error_idx)) {
X[notice_error_idx] <- NULL
meta <- meta[-notice_error_idx]
}
# Extract the trial condition from the meta line. NOTE(review): this call
# is correct despite the unusual argument order -- `replacement` is passed
# by name, so the positional `meta` binds to `x` (the text vector).
condition <- factor(
sub(".*(S1 obj|S2 match|S2 nomatch).*", meta, replacement = "\\1"),
levels = c("S1 obj", "S2 match", "S2 nomatch")
)
# Concatenate individual trials into a (256 * 64) x nr_trials matrix;
# columns are trials, rows follow the standardized within-trial order.
X <- matrix(unlist(X), 256 * 64)
# Track for reporting the nr. of non-finite values
nr_non_finite <- sum(!is.finite(X))
# Compute mean over trials grouped by trial condition. NOTE(review): a
# condition with zero remaining trials yields NaN means (rowMeans over an
# empty column selection) -- confirm downstream code tolerates this.
X <- c(
rowMeans(X[, condition == "S1 obj", drop = FALSE], na.rm = TRUE),
rowMeans(X[, condition == "S2 match", drop = FALSE], na.rm = TRUE),
rowMeans(X[, condition == "S2 nomatch", drop = FALSE], na.rm = TRUE)
)
# Store the per-condition mean trials of the current subject (a single
# numeric vector of length 3 * 256 * 64) in the list of subjects.
subjects[[subject]] <- X
# Remove/Delete subject file and decompressed folder
unlink(file.path(tmpdir, subject), recursive = TRUE)
unlink(file.path(tmpdir, sprintf("%s.tar.gz", subject)))
# Report progress: per-condition trial counts plus dropped-trial details
cat(sprintf(
"%5d/%d - Nr. trials: %3d = %3d + %3d + %3d%s%s\n",
i, length(subjects),
length(condition),
sum(condition == "S1 obj"),
sum(condition == "S2 match"),
sum(condition == "S2 nomatch"),
if (nr_non_finite) sprintf(", Nr. non-finite: %d", nr_non_finite) else "",
if (length(file_error_idx) && length(notice_error_idx)) {
sprintf(", %d file and %d notice errors (%s) -> trials dropped",
length(file_error_idx), length(notice_error_idx), subject
)
} else if (length(file_error_idx)) {
sprintf(", %d file errors (%s) -> trials dropped",
length(file_error_idx), subject
)
} else if (length(notice_error_idx)) {
sprintf(", %d notice errors (%s) -> trials dropped",
length(notice_error_idx), subject
)
} else {
""
}
))
}
# Stack all per-subject mean trials into a single 4D tensor with axes
# time x sensor x condition x subject (fastest to slowest varying), matching
# the within-subject layout produced by the processing loop.
tensor_dim <- c(
  time = 256, sensor = 64, condition = 3, subject = length(subjects)
)
tensor_dimnames <- list(
  time = 1:256,
  sensor = sensors,
  condition = c("S1 obj", "S2 match", "S2 nomatch"),
  subject = names(subjects)
)
X <- array(unlist(subjects), dim = tensor_dim, dimnames = tensor_dimnames)
# Extract the alcoholic/control label for each subject: the 4th character
# of the subject ID encodes the group ("a" = alcoholic, "c" = control).
# Subject IDs are kept as names for direct lookup.
group_code <- substr(names(subjects), 4, 4)
y <- setNames(
  factor(group_code, levels = c("c", "a"), labels = c("control", "alcoholic")),
  names(subjects)
)
# Persist the processed EEG data as R data files: the full 4D tensor, and a
# reduced dataset restricted to the "S1 obj" condition (one matrix/subject).
full_data <- list(X = X, y = y)
saveRDS(full_data, file = "eeg_data_3d.rds")
s1_obj_data <- list(X = X[, , "S1 obj", ], y = y)
saveRDS(s1_obj_data, file = "eeg_data_2d.rds")