# Preprocess the UCI EEG dataset ("eeg_full.tar"): read every trial of every
# subject, average the trials per condition, and store the result as a
# time x sensor x condition x subject tensor in two RDS files.

# List of all sensor names/positions (in channel order) to ensure all entries
# are placed at the same sensor x time position in the matrix layout
sensors <- c(
  "FP1", "FP2", "F7", "F8", "AF1", "AF2", "FZ", "F4", "F3", "FC6", "FC5",
  "FC2", "FC1", "T8", "T7", "CZ", "C3", "C4", "CP5", "CP6", "CP1", "CP2",
  "P3", "P4", "PZ", "P8", "P7", "PO2", "PO1", "O2", "O1", "X", "AF7", "AF8",
  "F5", "F6", "FT7", "FT8", "FPZ", "FC4", "FC3", "C6", "C5", "F2", "F1",
  "TP8", "TP7", "AFZ", "CP3", "CP4", "P5", "P6", "C1", "C2", "PO7", "PO8",
  "FCZ", "POZ", "OZ", "P2", "P1", "CPZ", "nd", "Y"
)

# Trial conditions in fixed order (used for factor levels and dimnames)
conditions <- c("S1 obj", "S2 match", "S2 nomatch")

# Read one gz-compressed trial file.
#
# Returns the 64 * 256 voltage readings as a numeric vector in standardized
# (sensor-major, time-minor) order, with the 4th meta line attached as the
# "meta" attribute — or NULL when the file holds no data or its contents are
# inconsistent (unknown sensor, missing/duplicated sensor-time entries).
read_trial <- function(trial) {
  conn <- gzfile(trial)
  # Destroy the connection on exit (also covers the early returns) to avoid
  # "closing unused connection" warnings at garbage collection time.
  on.exit(close(conn), add = TRUE)

  # Leading "# ..." meta data lines; fewer than 4 lines means the file does
  # NOT contain any data -> error, ignore trial.
  meta <- readLines(conn, 4)
  if (length(meta) < 4) {
    return(NULL)
  }

  # read.csv reopens the (now closed) connection from the start; the meta
  # lines are skipped again via comment.char.
  data <- read.csv(conn,
    header = FALSE, sep = " ", comment.char = "#",
    col.names = c("trial", "sensor", "time", "volts"),
    colClasses = c("integer", "character", "integer", "numeric")
  )

  # Target position of every row in the standardized layout:
  # (sensor index - 1) * 256 + time + 1, with time in 0..255. This ensures
  # the same placement of measurements in the matrix representation.
  data$sensor <- factor(data$sensor, levels = sensors)
  idx <- (as.integer(data$sensor) - 1L) * 256L + data$time + 1L

  # Reject trials with unknown sensors (NA index) or that do not cover every
  # sensor/time combination exactly once (guards against interleaved or
  # shifted data in the final array).
  if (length(idx) != 64L * 256L || anyNA(idx) ||
      any(sort(idx) != seq_along(idx))) {
    return(NULL)
  }

  # Scatter the measurements to their target positions. NOTE: indexing with
  # `data$volts[idx]` would apply the INVERSE permutation — identical only
  # when the file happens to be in standardized order already.
  volts <- numeric(64L * 256L)
  volts[idx] <- data$volts
  structure(volts, meta = meta[4])
}

tmpdir <- tempdir()
untar("eeg_full.tar", exdir = tmpdir) # uncompress
archive_files <- untar("eeg_full.tar", list = TRUE) # file names (only read)
# One list slot per subject, named by the 11-character subject ID
subjects <- `names<-`(
  vector("list", length(archive_files)),
  substr(archive_files, 1, 11)
)

for (i in seq_along(subjects)) {
  subject <- names(subjects)[i]

  # Decompress folder of trials for current subject
  untar(file.path(tmpdir, sprintf("%s.tar.gz", subject)), exdir = tmpdir)

  # Read all trial files of current subject (NULL marks an unreadable or
  # inconsistent file)
  X <- lapply(
    list.files(file.path(tmpdir, subject), full.names = TRUE),
    read_trial
  )

  # Count nr. of file errors (NULL results) and remove from data
  file_error_idx <- which(vapply(X, is.null, logical(1)))
  if (length(file_error_idx) > 0) {
    X[file_error_idx] <- NULL
  }

  # Check for error notification in meta data and drop those trials as well
  meta <- vapply(X, attr, character(1), "meta")
  notice_error_idx <- grep("err", meta)
  if (length(notice_error_idx) > 0) {
    X[notice_error_idx] <- NULL
    meta <- meta[-notice_error_idx]
  }

  # Extract trial condition from the remaining meta lines
  condition <- factor(
    sub(".*(S1 obj|S2 match|S2 nomatch).*", "\\1", meta),
    levels = conditions
  )

  if (length(X) > 0) {
    # Concatenate individual trials as columns of a matrix
    # (rows = standardized sensor x time order)
    X <- matrix(unlist(X), nrow = 64L * 256L)
    # Track for reporting the nr. of non-finite values
    nr_non_finite <- sum(!is.finite(X))
    # Mean over trials grouped by trial condition, concatenated per condition
    subjects[[subject]] <- unlist(lapply(conditions, function(cond) {
      rowMeans(X[, condition == cond, drop = FALSE], na.rm = TRUE)
    }))
  } else {
    # All trials dropped: fill with NA so the subject slot stays aligned in
    # the final 4D array (a NULL slot would silently shift all later
    # subjects' data in `unlist(subjects)`).
    nr_non_finite <- 0L
    subjects[[subject]] <- rep(NA_real_, 64L * 256L * length(conditions))
    warning(sprintf("Subject %s: all trials dropped", subject), call. = FALSE)
  }

  # Remove/Delete subject file and decompressed folder
  unlink(file.path(tmpdir, subject), recursive = TRUE)
  unlink(file.path(tmpdir, sprintf("%s.tar.gz", subject)))

  # Report progress
  drop_note <- if (length(file_error_idx) > 0 && length(notice_error_idx) > 0) {
    sprintf(
      ", %d file and %d notice errors (%s) -> trials dropped",
      length(file_error_idx), length(notice_error_idx), subject
    )
  } else if (length(file_error_idx) > 0) {
    sprintf(
      ", %d file errors (%s) -> trials dropped",
      length(file_error_idx), subject
    )
  } else if (length(notice_error_idx) > 0) {
    sprintf(
      ", %d notice errors (%s) -> trials dropped",
      length(notice_error_idx), subject
    )
  } else {
    ""
  }
  cat(sprintf(
    "%5d/%d - Nr. trials: %3d = %3d + %3d + %3d%s%s\n",
    i, length(subjects), length(condition),
    sum(condition == "S1 obj"),
    sum(condition == "S2 match"),
    sum(condition == "S2 nomatch"),
    if (nr_non_finite > 0) {
      sprintf(", Nr. non-finite: %d", nr_non_finite)
    } else {
      ""
    },
    drop_note
  ))
}

# Combine subjects in single 4D tensor (time x sensor x condition x subject)
X <- array(
  unlist(subjects),
  dim = c(time = 256, sensor = 64, condition = 3, subject = length(subjects)),
  dimnames = list(
    time = 1:256,
    sensor = sensors,
    condition = conditions,
    subject = names(subjects)
  )
)

# Extract alcoholic or control labels for each subject
# (4th character of the subject ID: "a" = alcoholic, "c" = control)
y <- factor(
  substr(names(subjects), 4, 4),
  levels = c("c", "a"),
  labels = c("control", "alcoholic")
)
names(y) <- names(subjects)

# Save full processed EEG dataset as R data files (full tensor and the
# "S1 obj" condition slice)
saveRDS(list(X = X, y = y), file = "eeg_data_3d.rds")
saveRDS(list(X = X[, , "S1 obj", ], y = y), file = "eeg_data_2d.rds")