diff --git a/LaTeX/main.bib b/LaTeX/main.bib
index b912c2d..09b7a8c 100644
--- a/LaTeX/main.bib
+++ b/LaTeX/main.bib
@@ -2392,7 +2392,7 @@
 @misc{lichess-database,
-    author = {Duplessis, Thibault},
+    author = {Thibault Duplessis},
     year = {2013},
     title = {lichess.org open database},
     url = {https://database.lichess.org},
diff --git a/dataAnalysis/eeg/02_eeg_2d.R b/dataAnalysis/eeg/02_eeg_2d.R
index e2d7ae7..eb901db 100644
--- a/dataAnalysis/eeg/02_eeg_2d.R
+++ b/dataAnalysis/eeg/02_eeg_2d.R
@@ -52,10 +52,10 @@ c(X, y) %<-% readRDS("eeg_data_2d.rds")
 #'
 #' @param X 3D EEG data (preprocessed or not)
 #' @param y binary response as a 3D tensor, every obs. is a 1 x 1 matrix
-loo.predict.gmlm <- function(X, y) {
+loo.predict.gmlm <- function(X, y, ...) {
     unlist(parallel::mclapply(seq_along(y), function(i) {
         # Fit with i'th observation removed
-        fit <- gmlm_tensor_normal(X[ , , -i], as.integer(y[-i]), sample.axis = 3L)
+        fit <- gmlm_tensor_normal(X[ , , -i], as.integer(y[-i]), sample.axis = 3L, ...)
 
         # Reduce the entire data set
         r <- as.vector(mlm(X, fit$betas, modes = 1:2, transpose = TRUE))
@@ -75,26 +75,35 @@
     }, mc.cores = getOption("mc.cores", max(1L, parallel::detectCores() - 1L))))
 }
 
+# "Projects" a sequence onto its `nr.freq` strongest frequency components
+proj.fft <- function(beta1, nr.freq = 5L) {
+    F <- fft(beta1)
+    Re(fft(`[<-`(F, head(order(abs(F)), -nr.freq), 0+0i), inverse = TRUE)) / length(F)
+}
+
 # perform preprocessed (reduced) and raw (not reduced) leave-one-out prediction
 y.hat.3.4   <- loo.predict.gmlm(preprocess(X, 3, 4), y)
 y.hat.15.15 <- loo.predict.gmlm(preprocess(X, 15, 15), y)
 y.hat.20.30 <- loo.predict.gmlm(preprocess(X, 20, 30), y)
 y.hat       <- loo.predict.gmlm(X, y)
+y.hat.fft   <- loo.predict.gmlm(X, y, proj.betas = list(proj.fft, NULL))
 
 # classification performance measures table by leave-one-out cross-validation
-(loo.cv <- apply(cbind(y.hat.3.4, y.hat.15.15, y.hat.20.30, y.hat), 2, function(y.pred) {
-    sapply(c("acc", "err", "fpr", "tpr", "fnr", "tnr", "auc", "auc.sd"),
-        function(FUN) { match.fun(FUN)(as.integer(y) - 1L, y.pred) })
-}))
-#>         y.hat.3.4 y.hat.15.15 y.hat.20.30      y.hat
-#> acc    0.79508197  0.78688525  0.78688525 0.78688525
-#> err    0.20491803  0.21311475  0.21311475 0.21311475
-#> fpr    0.35555556  0.40000000  0.40000000 0.40000000
-#> tpr    0.88311688  0.89610390  0.89610390 0.89610390
-#> fnr    0.11688312  0.10389610  0.10389610 0.10389610
-#> tnr    0.64444444  0.60000000  0.60000000 0.60000000
-#> auc    0.85108225  0.83838384  0.83924964 0.83896104
-#> auc.sd 0.03584791  0.03760531  0.03751307 0.03754553
+(loo.cv <- apply(
+    cbind(y.hat.3.4, y.hat.15.15, y.hat.20.30, y.hat, y.hat.fft), 2,
+    function(y.pred) {
+        sapply(c("acc", "err", "fpr", "tpr", "fnr", "tnr", "auc", "auc.sd"),
+            function(FUN) { match.fun(FUN)(as.integer(y) - 1L, y.pred) })
+    }
+))
+#>         y.hat.3.4 y.hat.15.15 y.hat.20.30      y.hat  y.hat.fft
+#> acc    0.79508197  0.78688525  0.78688525 0.78688525 0.81147541
+#> err    0.20491803  0.21311475  0.21311475 0.21311475 0.18852459
+#> fpr    0.35555556  0.40000000  0.40000000 0.40000000 0.33333333
+#> tpr    0.88311688  0.89610390  0.89610390 0.89610390 0.89610390
+#> fnr    0.11688312  0.10389610  0.10389610 0.10389610 0.10389610
+#> tnr    0.64444444  0.60000000  0.60000000 0.60000000 0.66666667
+#> auc    0.85194805  0.83838384  0.83924964 0.83896104 0.84646465
+#> auc.sd 0.03574475  0.03760531  0.03751754 0.03754553 0.03751864
 
 ################################## Tensor SIR ##################################
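A note on the new `proj.fft` helper: `order(abs(F))` sorts the Fourier coefficients by magnitude, `head(..., -nr.freq)` selects all but the `nr.freq` strongest, and the functional `[<-` call zeroes exactly those before inverting the transform, so the result is a smoothed real-valued version of the input. A minimal self-contained sketch of the same idiom on a toy signal (the signal and variable names are illustrative only, not part of the patch):

    proj.fft <- function(beta1, nr.freq = 5L) {
        F <- fft(beta1)
        # zero all but the `nr.freq` largest-magnitude coefficients, invert, rescale
        Re(fft(`[<-`(F, head(order(abs(F)), -nr.freq), 0+0i), inverse = TRUE)) / length(F)
    }

    t <- seq(0, 1, length.out = 128)
    noisy  <- sin(2 * pi * 3 * t) + 0.3 * rnorm(length(t))  # toy signal (assumption)
    smooth <- proj.fft(noisy)
    plot(t, noisy, type = "l", col = "gray")
    lines(t, smooth, lwd = 2)  # low-frequency reconstruction hugs the sine wave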
diff --git a/dataAnalysis/eeg/03_eeg_3d.R b/dataAnalysis/eeg/03_eeg_3d.R
index fe3709d..ce7d384 100644
--- a/dataAnalysis/eeg/03_eeg_3d.R
+++ b/dataAnalysis/eeg/03_eeg_3d.R
@@ -1,6 +1,8 @@
 library(tensorPredictors)
-library(parallel)
-library(pROC)
+suppressPackageStartupMessages({
+    library(parallel)
+    library(pROC)
+})
 
 #' Mode-Wise PCA preprocessing (generalized (2D)^2 PCA)
 #'
@@ -43,28 +45,6 @@ auc.sd <- function(y.true, y.pred) {
 }
 
 
-# # unified API for all reduction procedures
-# GMLM <- list(
-#     fit = function(X, y) tensorPredictors::gmlm_tensor_normal(X, as.integer(y), sample.axis = 4L),
-#     reduce = function(X, fit) mlm(X, fit$betas, 1:3, TRUE),
-#     applicable = function(X) TRUE
-# )
-# TSIR <- list(
-#     fit = function(X, y) tensorPredictors::TSIR(X, y, c(1L, 1L, 1L), sample.axis = 4L),
-#     reduce = function(X, fit) mlm(X, fit, 1:3, TRUE),
-#     applicable = function(X) TRUE
-# )
-# KPIR_LS <- list(
-#     fit = function(X, y) {
-#         if (any(dim(X)[-4] > dim(X)[4])) {
-#             stop("Dimensions too big")
-#         }
-#         tensorPredictors::kpir.ls(X, as.integer(y), sample.axis = 4L)
-#     },
-#     reduce = function(X, fit) if (is.null(fit)) NA else mlm(X, fit$alphas, 1:3, TRUE),
-#     applicable = function(X) all(dim(X)[1:3] <= dim(X)[4])
-# )
-
 #' Leave-one-out prediction using a given reduction method
 #'
 #' @param method reduction method to be applied
@@ -98,6 +78,12 @@ loo.predict <- function(method, X, y, ...) {
     }, mc.cores = getOption("mc.cores", max(1L, parallel::detectCores() - 1L))))
 }
 
+# "Projects" a sequence onto its `nr.freq` strongest frequency components
+proj.fft <- function(sequence, nr.freq = 5L) {
+    F <- fft(sequence)
+    Re(fft(`[<-`(F, head(order(abs(F)), -nr.freq), 0+0i), inverse = TRUE)) / length(F)
+}
+
 # Load full EEG dataset (3D tensor for each subject)
 c(X, y) %<-% readRDS("eeg_data_3d.rds")
 
@@ -110,21 +96,22 @@
 y.hat.3.4   <- loo.predict(gmlm_tensor_normal, preprocess(X, 3, 4, 3), y)
 y.hat.15.15 <- loo.predict(gmlm_tensor_normal, preprocess(X, 15, 15, 3), y)
 y.hat.20.30 <- loo.predict(gmlm_tensor_normal, preprocess(X, 20, 30, 3), y)
 y.hat       <- loo.predict(gmlm_tensor_normal, X, y)
+y.hat.fft   <- loo.predict(gmlm_tensor_normal, X, y, proj.betas = list(proj.fft, NULL, NULL))
 
 # classification performance measures table by leave-one-out cross-validation
-(loo.cv <- apply(cbind(y.hat.3.4, y.hat.15.15, y.hat.20.30, y.hat), 2, function(y.pred) {
+(loo.cv <- apply(cbind(y.hat.3.4, y.hat.15.15, y.hat.20.30, y.hat, y.hat.fft), 2, function(y.pred) {
     sapply(c("acc", "err", "fpr", "tpr", "fnr", "tnr", "auc", "auc.sd"),
         function(FUN) { match.fun(FUN)(as.integer(y) - 1L, y.pred) })
 }))
-#>         y.hat.3.4 y.hat.15.15 y.hat.20.30      y.hat
-#> acc    0.83606557  0.80327869  0.80327869 0.79508197
-#> err    0.16393443  0.19672131  0.19672131 0.20491803
-#> fpr    0.31111111  0.33333333  0.33333333 0.35555556
-#> tpr    0.92207792  0.88311688  0.88311688 0.88311688
-#> fnr    0.07792208  0.11688312  0.11688312 0.11688312
-#> tnr    0.68888889  0.66666667  0.66666667 0.64444444
-#> auc    0.88051948  0.86984127  0.86926407 0.86810967
-#> auc.sd 0.03118211  0.03254642  0.03259186 0.03295883
+#>         y.hat.3.4 y.hat.15.15 y.hat.20.30      y.hat  y.hat.fft
+#> acc    0.83606557  0.80327869  0.80327869 0.79508197 0.79508197
+#> err    0.16393443  0.19672131  0.19672131 0.20491803 0.20491803
+#> fpr    0.31111111  0.33333333  0.33333333 0.35555556 0.33333333
+#> tpr    0.92207792  0.88311688  0.88311688 0.88311688 0.87012987
+#> fnr    0.07792208  0.11688312  0.11688312 0.11688312 0.12987013
+#> tnr    0.68888889  0.66666667  0.66666667 0.64444444 0.66666667
+#> auc    0.88051948  0.86984127  0.86926407 0.86810967 0.86810967
+#> auc.sd 0.03118211  0.03254642  0.03259186 0.03295883 0.03354029
 
 ################################## Tensor SIR ##################################
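Both EEG scripts share the same leave-one-out driver: drop observation `i`, refit on the remainder, and predict the held-out point, parallelized over observations with `mclapply`. A condensed, self-contained sketch of that pattern using plain logistic regression as a stand-in for the tensor fit (all names below are illustrative, not the `tensorPredictors` API):

    suppressPackageStartupMessages(library(parallel))

    # Leave-one-out predictions: refit without the i'th observation,
    # then predict the probability of the held-out response.
    loo.sketch <- function(x, y) {
        unlist(mclapply(seq_along(y), function(i) {
            fit <- glm(y[-i] ~ x[-i], family = binomial())
            unname(plogis(coef(fit)[1] + coef(fit)[2] * x[i]))
        }, mc.cores = getOption("mc.cores", max(1L, detectCores() - 1L))))
    }

    x <- rnorm(50)
    y <- as.integer(x + rnorm(50) > 0)  # toy binary response (assumption)
    round(loo.sketch(x, y), 2)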
diff --git a/sim/sim_efficiency.R b/sim/sim_efficiency.R
new file mode 100644
index 0000000..ca3e10b
--- /dev/null
+++ b/sim/sim_efficiency.R
@@ -0,0 +1,182 @@
+library(tensorPredictors)
+
+
+# Set the PRNG seed to the first 4 digits of the golden ratio for reproducibility
+set.seed(1618L, "Mersenne-Twister", "Inversion", "Rejection")
+
+### Simulation configuration
+reps <- 100                                 # number of simulation replications
+sample.sizes <- c(100, 200, 300, 500, 750)  # sample sizes `n`
+
+# Parameterize the "true" reductions by the mode dimension
+gen.beta <- function(pj) {
+    as.matrix((-1)^seq_len(pj))
+}
+# the true precision matrices are simply diag(pj)
+
+
+# sampling from the conditional matrix normal `X | Y = y ~ N(mu(y), I_{p1 p2})`
+sample.data <- function(sample.size, betas, Omegas, eta1 = 0) {
+    # response is a standard normal variable
+    y <- rnorm(sample.size)
+    # F(y) is identical to y, except it is a tensor (last axis is the sample axis)
+    F <- array(y, dim = c(mapply(ncol, betas), sample.size))
+
+    # sample predictors from the tensor normal X | Y = y (last axis is the sample axis)
+    sample.axis <- length(betas) + 1L
+    Deltas <- Map(solve, Omegas)                          # normal covariances
+    mu_y <- mlm(mlm(F, betas) + as.vector(eta1), Deltas)  # conditional mean
+    X <- mu_y + rtensornorm(sample.size, 0, Deltas, sample.axis)  # predictors X
+
+    list(X = X, F = F, y = y, sample.axis = sample.axis)
+}
+
+
+# Open simulation CSV log file
+log.name <- format(Sys.time(), "sim_efficiency-%Y%m%dT%H%M.csv")
+log.file <- file(log.name, "w")
+# Count the number of writes; used to emit the CSV header on the first write only
+log.writes <- 0L
+
+# Setting p1 = p2 = pj (note, in the paper `p = p1 p2`)
+mode.dims <- round(1.2^unique(round(logb(2:200, 1.2))))
+for (pj in mode.dims) {
+
+    betas.true <- list(gen.beta(pj), gen.beta(pj))
+    B.true <- kronecker(betas.true[[2]], betas.true[[1]])
+    Omegas.true <- list(diag(pj), diag(pj))
+
+    for (sample.size in sample.sizes) {
+
+        sim <- sapply(seq_len(reps), function(.) {
+            c(X, F, y, sample.axis) %<-% sample.data(sample.size, betas.true, Omegas.true)
+
+            ds.lm <- tryCatch({
+                B.lm <- unname(lm.fit(t(`dim<-`(X, c(pj^2, sample.size))), drop(F))$coefficients)
+                dist.subspace(B.true, B.lm, normalize = TRUE)
+            }, error = function(.) NA)
+
+            c(., betas.vec, Omegas.vec) %<-% gmlm_tensor_normal(`dim<-`(X, c(pj^2, sample.size)), drop(F))
+
+            c(., betas.gmlm, Omegas.gmlm) %<-% gmlm_tensor_normal(X, F)
+
+            c(., betas.mani, Omegas.mani) %<-% gmlm_tensor_normal(X, F,
+                proj.Omegas = rep(list(function(O) { diag(mean(diag(O)), nrow(O)) }), 2)
+            )
+
+            ds.vec  <- dist.subspace(B.true, betas.vec[[1]], normalize = TRUE)
+            ds.gmlm <- dist.subspace(betas.true, betas.gmlm, normalize = TRUE)  # equivalent to dist.subspace(B.true, B.gmlm)
+            ds.mani <- dist.subspace(betas.true, betas.mani, normalize = TRUE)
+
+            c(lm = ds.lm, vec = ds.vec, gmlm = ds.gmlm, mani = ds.mani)
+        })
+
+        sim <- as.data.frame(t(sim))
+        sim$sample.size <- sample.size
+        sim$pj <- pj
+
+        # Append current simulation results to the log file
+        write.table(sim, file = log.file, sep = ",",
+            row.names = FALSE, col.names = (log.writes <- log.writes + 1L) < 2L
+        )
+
+        # print progress
+        cat(sprintf("mode dim (%d): %d/%d - sample size (%d): %d/%d\n",
+            pj, which(pj == mode.dims), length(mode.dims),
+            sample.size, which(sample.size == sample.sizes), length(sample.sizes)
+        ))
+    }
+}
+close(log.file)
+
+
+# Read the simulation data back in
+sim <- read.csv(log.name)
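The log-file bookkeeping above deserves a note: `col.names = (log.writes <- log.writes + 1L) < 2L` increments the write counter in place and evaluates to `TRUE` only on the first write, so the CSV header is emitted exactly once while results are appended incrementally to the open connection. The same pattern in isolation (temp file and toy data are assumptions for illustration):

    log.path <- tempfile(fileext = ".csv")  # stand-in for the real log name
    log.conn <- file(log.path, "w")
    writes <- 0L
    for (run in 1:3) {
        res <- data.frame(run = run, value = rnorm(2))
        # header only on the very first write, rows appended afterwards
        write.table(res, file = log.conn, sep = ",",
            row.names = FALSE, col.names = (writes <- writes + 1L) < 2L)
    }
    close(log.conn)
    read.csv(log.path)  # 6 rows under a single header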
+
+# Plot mean subspace distance with shaded mean +/- sd bands, per sample size
+with(merge(
+    aggregate(sim[names(sim) != "lm"], . ~ sample.size + pj, mean),
+    aggregate(sim[names(sim) != "lm"], . ~ sample.size + pj, sd),
+    by = c("sample.size", "pj"),
+    suffixes = c("", ".sd"),
+    all = FALSE
+), {
+    plot(range(pj), range(c(vec, gmlm, mani)), type = "n",
+        main = "Simulation -- Efficiency Gain",
+        xlab = expression(tilde(p)),
+        ylab = expression(d(B, hat(B)))
+    )
+    # shaded mean +/- sd bands for every estimator and sample size
+    for (sz in sort(unique(sample.size))) {
+        i <- order(pj)[(sample.size == sz)[order(pj)]]
+        polygon(c(pj[i], rev(pj[i])), c(vec[i] + vec.sd[i], rev(vec[i] - vec.sd[i])),
+            col = paste0("#b30303", "50"), border = NA
+        )
+        polygon(c(pj[i], rev(pj[i])), c(gmlm[i] + gmlm.sd[i], rev(gmlm[i] - gmlm.sd[i])),
+            col = paste0("#002d8d", "50"), border = NA
+        )
+        polygon(c(pj[i], rev(pj[i])), c(mani[i] + mani.sd[i], rev(mani[i] - mani.sd[i])),
+            col = paste0("#006e18", "50"), border = NA
+        )
+    }
+    # mean curves on top, one line type per sample size
+    lty.idx <- 1L
+    for (sz in sort(unique(sample.size))) {
+        i <- order(pj)[(sample.size == sz)[order(pj)]]
+        lines(pj[i], vec[i],  type = "b", pch = 16, col = "#b30303", lty = lty.idx, lwd = 2)
+        lines(pj[i], gmlm[i], type = "b", pch = 16, col = "#002d8d", lty = lty.idx, lwd = 2)
+        lines(pj[i], mani[i], type = "b", pch = 16, col = "#006e18", lty = lty.idx, lwd = 2)
+        lty.idx <- lty.idx + 1L
+    }
+})
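The summarize-then-ribbon step above (per-group means and standard deviations merged side by side via `suffixes = c("", ".sd")`, then shaded with `polygon`) is reusable on its own. A toy version under invented data and column names:

    set.seed(42)
    toy <- data.frame(g = rep(1:5, each = 20), val = rnorm(100, mean = rep(1:5, each = 20) / 5))
    stats <- merge(
        aggregate(val ~ g, toy, mean),  # per-group means, column stays `val`
        aggregate(val ~ g, toy, sd),    # per-group sds, column becomes `val.sd`
        by = "g", suffixes = c("", ".sd")
    )
    with(stats, {
        plot(g, val, type = "n", ylim = range(val - val.sd, val + val.sd))
        polygon(c(g, rev(g)), c(val + val.sd, rev(val - val.sd)),
            col = paste0("#002d8d", "50"), border = NA)
        lines(g, val, type = "b", pch = 16, col = "#002d8d", lwd = 2)
    })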
diff --git a/tensorPredictors/R/gmlm_tensor_normal.R b/tensorPredictors/R/gmlm_tensor_normal.R
index da7c4ac..b0ce71d 100644
--- a/tensorPredictors/R/gmlm_tensor_normal.R
+++ b/tensorPredictors/R/gmlm_tensor_normal.R
@@ -114,9 +114,9 @@ gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)),
                 Sigmas[[j]] <- Sigmas[[j]] + diag(0.2 * min_max[2], nrow(Sigmas[[j]]))
             }
         }
-        # Compute (unconstraint but regularized) Omega_j as covariance inverse
+        # Compute the (unconstrained) Omega_j's as covariance inverses
         Omegas[[j]] <- solve(Sigmas[[j]])
-        # Project Omega_j to the Omega_j's manifold
+        # Project the Omega_j's onto their manifolds
         if (is.function(proj_j <- proj.Omegas[[j]])) {
             Omegas[[j]] <- proj_j(Omegas[[j]])
             # Reverse computation of `Sigma_j` as inverse of `Omega_j`
diff --git a/tensorPredictors/R/kpir_ls.R b/tensorPredictors/R/kpir_ls.R
index 96a4270..fe910b4 100644
--- a/tensorPredictors/R/kpir_ls.R
+++ b/tensorPredictors/R/kpir_ls.R
@@ -44,14 +44,14 @@ kpir.ls <- function(X, Fy, max.iter = 20L, sample.axis = 1L,
     }, modes, dim(Fy)[modes])
 
-    ### Step 2: iterate per mode (axis) least squares estimates 
+    ### Step 2: iterate per mode (axis) least squares estimates
     for (iter in seq_len(max.iter)) {
-        
+
         # Invoke logger for previous iterate
         if (is.function(logger)) {
             logger("ls", iter - 1L, alphas)
         }
-        
+
         # cyclic iterate over modes
         for (j in seq_along(modes)) {
             # least squares solution for `alpha_j | alpha_i, i != j`
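The comment fix above concerns the same `proj.Omegas` hook that `sim_efficiency.R` exercises: after each unconstrained update, `Omega_j` is passed through a user-supplied projection onto a constrained manifold, and `Sigma_j` is then recomputed as its inverse. The isotropic projection used in the simulation, shown in isolation (the random input matrix is an illustrative assumption):

    # Project a precision matrix onto the isotropic manifold { c * I : c > 0 }
    # by matching the average diagonal entry (the closure from sim_efficiency.R)
    proj.iso <- function(O) diag(mean(diag(O)), nrow(O))

    O <- crossprod(matrix(rnorm(16), 4))  # a random SPD matrix, assumption
    proj.iso(O)                           # scaled identity with the same mean diagonal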