library(tensorPredictors) # library(RGCCA) # for `mgcca` ### Load modified version which _does not_ create a clusster in case of ### `n_cores == 1` allowing huge speed improvements! (at least on Ubuntu 22.04 LTS) ### Moreover, it is compatible with `Rscript` devtools::load_all("~/Work/tensorPredictors/References/Software/TGCCA-modified", export_all = FALSE) setwd("~/Work/tensorPredictors/sim/") base.name <- format(Sys.time(), "sim_1e_normal-%Y%m%dT%H%M") # Source utility function used in most simulations (extracted for convenience) source("./sim_utils.R") # Set PRNG seed for reproducability # Note: `0x` is the HEX number prefix and the trailing `L` stands for "long" # which is `R`s way if indicating an integer. set.seed(0x1eL, "Mersenne-Twister", "Inversion", "Rejection") ### Simulation configuration reps <- 100 # number of simulation replications sample.sizes <- c(100, 200, 300, 500, 750) # sample sizes `n` dimX <- c(5, 5) # predictor `X` dimension dimF <- c(2, 2) Sigma <- 0.5^abs(outer(1:prod(dimX), 1:prod(dimX), `-`)) # AR(0.5) # # define projections onto tri-diagonal matrixes # proj.betas <- Map(.projRank, rep(1L, length(dimX))) # wrong assumption of low rank betas # proj.Omegas <- Map(.projBand, Map(c, dimX, dimX), 1L, 1L) # wrong assumption of band Scatter matrices # data sampling routine sample.data <- function(sample.size, dimX, dimF, B.true, Sigma) { y <- rnorm(sample.size) # the true F (in vectorized form) vecF <- rbind(1, sin(y), cos(y), sin(y) * cos(y)) # sample vectorized X as a multi-variate normal (in vectorized form) vecX <- B.true %*% vecF + t(chol(Sigma)) %*% matrix(rnorm(prod(sample.size, dimX)), prod(dimX)) X <- array(vecX, c(dimX, sample.size)) list(X, vecF, y, length(dim(X))) } # wrong assumption about the function `F(y)` F.wrong <- function(y) array(rbind(1, y, y, y^2), c(2, 2, length(y))) # Create a CSV logger to write simulation results to log.file <- paste(base.name, "csv", sep = ".") logger <- CSV.logger( file.name = log.file, header = c("sample.size", "rep", outer( c("time", "dist.subspace", "dist.projection"), # measures c("gmlm", "pca", "hopca", "tsir", "mgcca"), # methods paste, sep = "." )) ) # Miss-specify true beta as _not_ a kronecker product B.true <- diag(1, prod(dimX), prod(dimF)) ### for each sample size for (sample.size in sample.sizes) { # repeate every simulation for (rep in seq_len(reps)) { # Sample training data c(X, F.true, y, sample.axis) %<-% sample.data(sample.size, dimX, dimF, B.true, Sigma) # fit different models # Wrapped in try-catch clock to ensure the simulation continues, # if an error occures continue with nest resplication and log an error message tryCatch({ time.gmlm <- system.time( fit.gmlm <- gmlm_tensor_normal(X, F.wrong(y), sample.axis = sample.axis) )["user.self"] time.pca <- system.time( fit.pca <- prcomp(mat(X, sample.axis), rank. = prod(dimF)) )["user.self"] time.hopca <- system.time( fit.hopca <- HOPCA(X, npc = dimF, sample.axis = sample.axis) )["user.self"] time.tsir <- system.time( fit.tsir <- TSIR(X, y, dimF, sample.axis = sample.axis) )["user.self"] # `mgcca` expects the first axis to be the sample axis X1 <- aperm(X, c(sample.axis, seq_along(dim(X))[-sample.axis])) time.mgcca <- system.time( fit.mgcca <- mgcca( list(X1, y), quiet = TRUE, scheme = "factorial", ncomp = c(prod(dimF), 1), ranks = c(prod(dimF), 1) ) )["user.self"] }, error = function(ex) { print(ex) }) # Compute true reduction matrix B.gmlm <- with(fit.gmlm, Reduce(`%x%`, rev(betas))) B.pca <- fit.pca$rotation B.hopca <- Reduce(`%x%`, rev(fit.hopca)) B.tsir <- Reduce(`%x%`, rev(fit.tsir)) B.mgcca <- fit.mgcca$astar[[1]] # Subspace Distances: Normalized `|| P_A - P_B ||_F` where # `P_A = A (A' A)^-1 A'` and the normalization means that with # respect to the dimensions of `A, B` the subspace distance is in the # range `[0, 1]`. dist.subspace.gmlm <- dist.subspace(B.true, B.gmlm, normalize = TRUE) dist.subspace.pca <- dist.subspace(B.true, B.pca, normalize = TRUE) dist.subspace.hopca <- dist.subspace(B.true, B.hopca, normalize = TRUE) dist.subspace.tsir <- dist.subspace(B.true, B.tsir, normalize = TRUE) dist.subspace.mgcca <- dist.subspace(B.true, B.mgcca, normalize = TRUE) # Projection Distances: Spectral norm (2-norm) `|| P_A - P_B ||_2`. dist.projection.gmlm <- dist.projection(B.true, B.gmlm) dist.projection.pca <- dist.projection(B.true, B.pca) dist.projection.hopca <- dist.projection(B.true, B.hopca) dist.projection.tsir <- dist.projection(B.true, B.tsir) dist.projection.mgcca <- dist.projection(B.true, B.mgcca) # Call CSV logger writing results to file logger() # print progress cat(sprintf("sample size (%d): %d/%d - rep: %d/%d\n", sample.size, which(sample.size == sample.sizes), length(sample.sizes), rep, reps)) } } ### read simulation results generate plots if (!interactive()) { pdf(file = paste(base.name, "pdf", sep = ".")) } sim <- read.csv(log.file) # Remain sample size grouping variable to avoid conflicts aggr.mean <- aggregate(sim, list(sampleSize = sim$sample.size), mean) aggr.median <- aggregate(sim, list(sampleSize = sim$sample.size), median) aggr.sd <- aggregate(sim, list(sampleSize = sim$sample.size), sd) aggr.min <- aggregate(sim, list(sampleSize = sim$sample.size), min) aggr.max <- aggregate(sim, list(sampleSize = sim$sample.size), max) par(pch = 16, bty = "n", lty = "solid", lwd = 1.5) cols <- c(gmlm = "blue", pca = "darkcyan", hopca = "red", tsir = "darkgreen", mgcca = "purple") with(aggr.mean, { plot(range(sampleSize), c(0, 1), type = "n", main = "Subspace Distance", xlab = "Sample Size", ylab = "Distance" ) for (dist.name in ls(pattern = "^dist.subspace")) { mean <- get(dist.name) median <- aggr.median[aggr.sd$sampleSize == sampleSize, dist.name] sd <- aggr.sd[aggr.sd$sampleSize == sampleSize, dist.name] min <- aggr.min[aggr.sd$sampleSize == sampleSize, dist.name] max <- aggr.max[aggr.sd$sampleSize == sampleSize, dist.name] method <- tail(strsplit(dist.name, ".", fixed = TRUE)[[1]], 1) col <- cols[method] lines(sampleSize, mean, type = "o", col = col, lty = 1, lwd = 1) lines(sampleSize, mean + sd, col = col, lty = 2, lwd = 0.8) lines(sampleSize, mean - sd, col = col, lty = 2, lwd = 0.8) lines(sampleSize, median, col = col, lty = 1, lwd = 1) lines(sampleSize, min, col = col, lty = 3, lwd = 0.6) lines(sampleSize, max, col = col, lty = 3, lwd = 0.6) } legend("topright", col = cols, lty = 1, legend = names(cols), bty = "n", lwd = par("lwd"), pch = par("pch")) }) with(aggr.mean, { plot(range(sampleSize), c(0, 1), type = "n", main = "Projection Distance", xlab = "Sample Size", ylab = "Distance" ) for (dist.name in ls(pattern = "^dist.projection")) { mean <- get(dist.name) median <- aggr.median[aggr.sd$sampleSize == sampleSize, dist.name] sd <- aggr.sd[aggr.sd$sampleSize == sampleSize, dist.name] min <- aggr.min[aggr.sd$sampleSize == sampleSize, dist.name] max <- aggr.max[aggr.sd$sampleSize == sampleSize, dist.name] method <- tail(strsplit(dist.name, ".", fixed = TRUE)[[1]], 1) col <- cols[method] lines(sampleSize, mean, type = "o", col = col, lty = 1, lwd = 1) lines(sampleSize, mean + sd, col = col, lty = 2, lwd = 0.8) lines(sampleSize, mean - sd, col = col, lty = 2, lwd = 0.8) lines(sampleSize, median, col = col, lty = 1, lwd = 1) lines(sampleSize, min, col = col, lty = 3, lwd = 0.6) lines(sampleSize, max, col = col, lty = 3, lwd = 0.6) } legend("topright", col = cols, lty = 1, legend = names(cols), bty = "n", lwd = par("lwd"), pch = par("pch")) }) # sample.axis <- 1L # F.wrong <- array(outer(y, c(0, 1, 1, 2, 1, 2, 2, 3), `^`), c(10, 2, 2, 2)) # F.true <- array(c(1, sin(y), cos(y))) # 1 s c # s ss cs # c sc cc # s ss cs # ss sss css # cs scs ccs # c sc cc # sc ssc csc # cc scc ccc # osc <- # I <- array(0, c(3, 3, 3)) # slice.index()