From 39555ec1fa0e61da7030565920e3c05c6b8e6bdf Mon Sep 17 00:00:00 2001
From: daniel
Date: Tue, 21 Nov 2023 12:25:35 +0100
Subject: [PATCH] add: tensor normal simulations

---
 .gitignore            |   9 ++
 sim/sim_1a_normal.R   | 148 ++++++++++++++++++++
 sim/sim_1b_normal.R   | 149 ++++++++++++++++++++
 sim/sim_1b_normal_2.R | 185 +++++++++++++++++++++++++
 sim/sim_1c_normal.R   | 152 ++++++++++++++++++++
 sim/sim_1d_normal.R   | 157 +++++++++++++++++++++
 sim/sim_1e_normal.R   | 231 +++++++++++++++++++++++++++++++
 sim/sim_2a_ising.R    | 315 ++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 1346 insertions(+)
 create mode 100644 sim/sim_1a_normal.R
 create mode 100644 sim/sim_1b_normal.R
 create mode 100644 sim/sim_1b_normal_2.R
 create mode 100644 sim/sim_1c_normal.R
 create mode 100644 sim/sim_1d_normal.R
 create mode 100644 sim/sim_1e_normal.R
 create mode 100644 sim/sim_2a_ising.R

diff --git a/.gitignore b/.gitignore
index ca4e0d5..542b420 100644
--- a/.gitignore
+++ b/.gitignore
@@ -108,8 +108,17 @@ simulations/
 !**/LaTeX/*.bib
 **/LaTeX/*-blx.bib
 
+# Include subfolders for images and plots
+!**/LaTeX/plots/
+**/LaTeX/plots/*
+!**/LaTeX/plots/*.tex
+!**/LaTeX/images/
+**/LaTeX/images/*
+!**/LaTeX/images/*.tex
+
 mlda_analysis/
 References/
+dataAnalysis/
 
 *.csv
 *.csv.log
diff --git a/sim/sim_1a_normal.R b/sim/sim_1a_normal.R
new file mode 100644
index 0000000..d266793
--- /dev/null
+++ b/sim/sim_1a_normal.R
@@ -0,0 +1,148 @@
+library(tensorPredictors)
+# library(RGCCA)
+### Load modified version which _does not_ create a cluster in case of
+### `n_cores == 1`, allowing huge speed improvements (at least on Ubuntu 22.04 LTS).
+### Moreover, it is compatible with `Rscript`
+devtools::load_all("~/Work/tensorPredictors/References/Software/TGCCA-modified", export_all = FALSE)
+
+
+setwd("~/Work/tensorPredictors/sim/")
+base.name <- format(Sys.time(), "sim_1a_normal-%Y%m%dT%H%M")
+
+# Source utility functions used in most simulations (extracted for convenience)
+source("./sim_utils.R")
+
+# Set PRNG seed for reproducibility
+# Note: `0x` is the HEX number prefix and the trailing `L` stands for "long",
+# which is `R`'s way of indicating an integer.
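+# For example (an added illustration, not part of the original script):
+# `0x1a` is hexadecimal for 26, so `identical(0x1aL, 26L)` returns `TRUE`.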
+set.seed(0x1aL, "Mersenne-Twister", "Inversion", "Rejection")
+
+### Simulation configuration
+reps <- 100                                 # number of simulation replications
+sample.sizes <- c(100, 200, 300, 500, 750)  # sample sizes `n`
+dimX <- c(2, 3, 5)                          # predictor `X` dimension
+dimF <- rep(1, length(dimX))                # dimension of the "function" `F(y)` of the response `y`
+
+# setup true model parameters
+betas <- Map(diag, 1, dimX, dimF)
+Omegas <- Map(function(pj) 0.5^abs(outer(1:pj, 1:pj, `-`)), dimX) # AR(0.5)
+eta1 <- 0
+
+# data sampling routine
+sample.data <- function(sample.size, eta1, betas, Omegas) {
+    # response is a standard normal variable
+    y <- rnorm(sample.size)
+    # F(y) is identical to y, except it's a tensor (last axis is the sample axis)
+    F <- array(y, dim = c(mapply(ncol, betas), sample.size))
+
+    # sample predictors from the tensor normal X | Y = y (last axis is the sample axis)
+    sample.axis <- length(betas) + 1L
+    Deltas <- Map(solve, Omegas)                                    # normal covariances
+    mu_y <- mlm(mlm(F, betas) + as.vector(eta1), Deltas)            # conditional mean
+    X <- mu_y + rtensornorm(sample.size, 0, Deltas, sample.axis)    # predictors X
+
+    list(X = X, F = F, y = y, sample.axis = sample.axis)
+}
+
+# Create a CSV logger to write simulation results to
+log.file <- paste(base.name, "csv", sep = ".")
+logger <- CSV.logger(
+    file.name = log.file,
+    header = c("sample.size", "rep", outer(
+        c("time", "dist.subspace"),                             # measures
+        c("gmlm", "pca", "hopca", "tsir", "mgcca", "hocca"),    # methods
+        paste, sep = "."
+    ), "dist.subspace.init")
+)
+
+# compute true (full) model parameters to compare estimates against
+B.true <- Reduce(`%x%`, rev(betas))
+
+### for each sample size
+for (sample.size in sample.sizes) {
+    # repeat every simulation
+    for (rep in seq_len(reps)) {
+        # Sample training data
+        c(X, F, y, sample.axis) %<-% sample.data(sample.size, eta1, betas, Omegas)
+
+        # Fit different models.
+        # Wrapped in a try-catch block to ensure the simulation continues;
+        # if an error occurs, print it and continue with the next replication.
+        tryCatch({
+            time.gmlm <- system.time(
+                fit.gmlm <- gmlm_tensor_normal(X, F, sample.axis = sample.axis)
+            )["user.self"]
+            time.pca <- system.time(
+                fit.pca <- prcomp(mat(X, sample.axis), rank. = prod(dimF))
+            )["user.self"]
+            time.hopca <- system.time(
+                fit.hopca <- HOPCA(X, npc = dimF, sample.axis = sample.axis)
+            )["user.self"]
+            time.tsir <- system.time(
+                fit.tsir <- TSIR(X, y, dimF, sample.axis = sample.axis)
+            )["user.self"]
+            # `mgcca` expects the first axis to be the sample axis
+            X1 <- aperm(X, c(sample.axis, seq_along(dim(X))[-sample.axis]))
+            F1 <- aperm(F, c(sample.axis, seq_along(dim(X))[-sample.axis]))
+            time.mgcca <- system.time(
+                fit.mgcca <- mgcca(
+                    list(X1, drop(F1)),     # `drop` removes the 1D axes (`mgcca` raises an exception without)
+                    quiet = TRUE,
+                    scheme = "factorial",
+                    ncomp = rep(prod(dimF), 2),
+                    ranks = rep(prod(dimF), 2)
+                )
+            )["user.self"]
+            time.hocca <- system.time(
+                fit.hocca <- HOCCA(X, F, sample.axis = sample.axis)
+            )["user.self"]
+        }, error = function(ex) {
+            print(ex)
+        })
+
+        # Compute estimated (vectorized) reduction matrices
+        B.gmlm <- with(fit.gmlm, Reduce(`%x%`, rev(betas)))
+        B.init <- Reduce(`%x%`, rev(fit.gmlm$betas.init))
+        B.pca <- fit.pca$rotation
+        B.hopca <- Reduce(`%x%`, rev(fit.hopca))
+        B.tsir <- Reduce(`%x%`, rev(fit.tsir))
+        B.mgcca <- fit.mgcca$astar[[1]]
+        B.hocca <- Reduce(`%x%`, rev(fit.hocca$dirsX))
+
+        # Subspace Distances: Normalized `|| P_A - P_B ||_F` where
+        # `P_A = A (A' A)^-1 A'` and the normalization means that with
+        # respect to the dimensions of `A, B` the subspace distance is in the
+        # range `[0, 1]` (a minimal sketch of this distance is given after the
+        # plotting code at the end of this file).
+        dist.subspace.gmlm <- dist.subspace(B.true, B.gmlm, normalize = TRUE)
+        dist.subspace.init <- dist.subspace(B.true, B.init, normalize = TRUE)
+        dist.subspace.pca <- dist.subspace(B.true, B.pca, normalize = TRUE)
+        dist.subspace.hopca <- dist.subspace(B.true, B.hopca, normalize = TRUE)
+        dist.subspace.tsir <- dist.subspace(B.true, B.tsir, normalize = TRUE)
+        dist.subspace.mgcca <- dist.subspace(B.true, B.mgcca, normalize = TRUE)
+        dist.subspace.hocca <- dist.subspace(B.true, B.hocca, normalize = TRUE)
+
+        # Projection Distances: Spectral norm (2-norm) `|| P_A - P_B ||_2`.
+        # Equivalent to the subspace distance in the 1D reduction case.
+
+        # Call CSV logger writing results to file
+        logger()
+
+        # print progress
+        cat(sprintf("sample size (%d): %d/%d - rep: %d/%d\n",
+            sample.size, which(sample.size == sample.sizes),
+            length(sample.sizes), rep, reps))
+    }
+}
+
+
+
+### read simulation results and generate plots
+if (!interactive()) { pdf(file = paste(base.name, "pdf", sep = ".")) }
+
+sim <- read.csv(log.file)
+
+plot.sim(sim, "dist.subspace", main = "Subspace Distance",
+    xlab = "Sample Size", ylab = "Distance")
+
+plot.sim(sim, "time", main = "Runtime",
+    xlab = "Sample Size", ylab = "Time [s]", ylim = c(0, 18))
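+# A minimal sketch of the subspace distance used above (an added illustration,
+# not part of the original script). It assumes `A`, `B` have full column rank;
+# the exact normalization constant is an assumption here, the package's
+# `dist.subspace` is authoritative.
+proj.sketch <- function(A) A %*% solve(crossprod(A), t(A))
+dist.subspace.sketch <- function(A, B) {
+    # Frobenius norm of the projection difference, scaled to lie in [0, 1]
+    norm(proj.sketch(A) - proj.sketch(B), type = "F") / sqrt(ncol(A) + ncol(B))
+}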
diff --git a/sim/sim_1b_normal.R b/sim/sim_1b_normal.R
new file mode 100644
index 0000000..26027c6
--- /dev/null
+++ b/sim/sim_1b_normal.R
@@ -0,0 +1,149 @@
+library(tensorPredictors)
+# library(RGCCA)
+### Load modified version which _does not_ create a cluster in case of
+### `n_cores == 1`, allowing huge speed improvements (at least on Ubuntu 22.04 LTS).
+### Moreover, it is compatible with `Rscript`
+### Also added `Encoding: UTF-8` in `DESCRIPTION`
+devtools::load_all("~/Work/tensorPredictors/References/Software/TGCCA-modified", export_all = FALSE)
+
+
+setwd("~/Work/tensorPredictors/sim/")
+base.name <- format(Sys.time(), "sim_1b_normal-%Y%m%dT%H%M")
+
+# Source utility functions used in most simulations (extracted for convenience)
+source("./sim_utils.R")
+
+# Set PRNG seed for reproducibility
+# Note: `0x` is the HEX number prefix and the trailing `L` stands for "long",
+# which is `R`'s way of indicating an integer.
+set.seed(0x1bL, "Mersenne-Twister", "Inversion", "Rejection")
+
+### Simulation configuration
+reps <- 100                                 # number of simulation replications
+sample.sizes <- c(100, 200, 300, 500, 750)  # sample sizes `n`
+dimX <- c(2, 3, 5)                          # predictor `X` dimension
+dimF <- rep(2, length(dimX))                # dimension of the "function" `F(y)` of the response `y`
+
+# setup true model parameters
+betas <- Map(diag, 1, dimX, dimF)
+Omegas <- Map(function(pj) 0.5^abs(outer(1:pj, 1:pj, `-`)), dimX) # AR(0.5)
+eta1 <- 0
+
+# data sampling routine
+sample.data <- function(sample.size, eta1, betas, Omegas) {
+    # response is a standard normal variable
+    y <- rnorm(sample.size)
+    # F(y) is a tensor of monomials
+    F <- sapply(y, function(yi) Reduce(outer, Map(`^`, yi, Map(seq, 0, len = dimF))))
+    dim(F) <- c(dimF, sample.size)
+
+    # sample predictors from the tensor normal X | Y = y (last axis is the sample axis)
+    sample.axis <- length(betas) + 1L
+    Deltas <- Map(solve, Omegas)                                    # normal covariances
+    mu_y <- mlm(mlm(F, betas) + as.vector(eta1), Deltas)            # conditional mean
+    X <- mu_y + rtensornorm(sample.size, 0, Deltas, sample.axis)    # predictors X
+
+    list(X = X, F = F, y = y, sample.axis = sample.axis)
+}
+
+# Create a CSV logger to write simulation results to
+log.file <- paste(base.name, "csv", sep = ".")
+logger <- CSV.logger(
+    file.name = log.file,
+    header = c("sample.size", "rep", outer(
+        c("time", "dist.subspace", "dist.projection"),  # measures
+        c("gmlm", "pca", "hopca", "tsir", "mgcca"),     # methods
+        paste, sep = "."
+    ))
+)
+
+# compute true (full) model parameters to compare estimates against
+B.true <- Reduce(`%x%`, rev(betas))
+
+### for each sample size
+for (sample.size in sample.sizes) {
+    # repeat every simulation
+    for (rep in seq_len(reps)) {
+        # Sample training data
+        c(X, F, y, sample.axis) %<-% sample.data(sample.size, eta1, betas, Omegas)
+
+        # Fit different models.
+        # Wrapped in a try-catch block to ensure the simulation continues;
+        # if an error occurs, print it and continue with the next replication.
+        tryCatch({
+            time.gmlm <- system.time(
+                fit.gmlm <- gmlm_tensor_normal(X, F, sample.axis = sample.axis)
+            )["user.self"]
+            time.pca <- system.time(
+                fit.pca <- prcomp(mat(X, sample.axis), rank. = prod(dimF))
+            )["user.self"]
+            time.hopca <- system.time(
+                fit.hopca <- HOPCA(X, npc = dimF, sample.axis = sample.axis)
+            )["user.self"]
+            time.tsir <- system.time(
+                fit.tsir <- TSIR(X, y, dimF, sample.axis = sample.axis)
+            )["user.self"]
+            # `mgcca` expects the first axis to be the sample axis
+            X1 <- aperm(X, c(sample.axis, seq_along(dim(X))[-sample.axis]))
+            F1 <- aperm(F, c(sample.axis, seq_along(dim(X))[-sample.axis]))
+            time.mgcca <- system.time(
+                fit.mgcca <- mgcca(
+                    list(X1, drop(F1)),     # `drop` removes 1D axes
+                    quiet = TRUE,
+                    scheme = "factorial",
+                    ncomp = rep(prod(dimF), 2)
+                )
+            )["user.self"]
+        }, error = function(ex) {
+            print(ex)
+        })
+
+        # Compute estimated (vectorized) reduction matrices
+        B.gmlm <- with(fit.gmlm, Reduce(`%x%`, rev(betas)))
+        B.pca <- fit.pca$rotation
+        B.hopca <- Reduce(`%x%`, rev(fit.hopca))
+        B.tsir <- Reduce(`%x%`, rev(fit.tsir))
+        B.mgcca <- fit.mgcca$astar[[1]]
+
+        # Subspace Distances: Normalized `|| P_A - P_B ||_F` where
+        # `P_A = A (A' A)^-1 A'` and the normalization means that with
+        # respect to the dimensions of `A, B` the subspace distance is in the
+        # range `[0, 1]`.
+        dist.subspace.gmlm <- dist.subspace(B.true, B.gmlm, normalize = TRUE)
+        dist.subspace.pca <- dist.subspace(B.true, B.pca, normalize = TRUE)
+        dist.subspace.hopca <- dist.subspace(B.true, B.hopca, normalize = TRUE)
+        dist.subspace.tsir <- dist.subspace(B.true, B.tsir, normalize = TRUE)
+        dist.subspace.mgcca <- dist.subspace(B.true, B.mgcca, normalize = TRUE)
+
+        # Projection Distances: Spectral norm (2-norm) `|| P_A - P_B ||_2`
+        # (a minimal sketch of this distance is given after the plotting code
+        # at the end of this file).
+        dist.projection.gmlm <- dist.projection(B.true, B.gmlm)
+        dist.projection.pca <- dist.projection(B.true, B.pca)
+        dist.projection.hopca <- dist.projection(B.true, B.hopca)
+        dist.projection.tsir <- dist.projection(B.true, B.tsir)
+        dist.projection.mgcca <- dist.projection(B.true, B.mgcca)
+
+        # Call CSV logger writing results to file
+        logger()
+
+        # print progress
+        cat(sprintf("sample size (%d): %d/%d - rep: %d/%d\n",
+            sample.size, which(sample.size == sample.sizes),
+            length(sample.sizes), rep, reps))
+    }
+}
+
+
+
+### read simulation results and generate plots
+if (!interactive()) { pdf(file = paste(base.name, "pdf", sep = ".")) }
+
+sim <- read.csv(log.file)
+
+plot.sim(sim, "dist.subspace", main = "Subspace Distance",
+    xlab = "Sample Size", ylab = "Distance")
+
+plot.sim(sim, "dist.projection", main = "Projection Distance",
+    xlab = "Sample Size", ylab = "Distance")
+
+plot.sim(sim, "time", main = "Runtime",
+    xlab = "Sample Size", ylab = "Time [s]", ylim = c(0, 18))
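+# A minimal sketch of the projection distance used above (an added
+# illustration, not part of the original script), under the same full-rank
+# assumption as the subspace-distance sketch in `sim_1a_normal.R`; the
+# package's `dist.projection` is authoritative.
+dist.projection.sketch <- function(A, B) {
+    proj <- function(M) M %*% solve(crossprod(M), t(M))
+    # spectral norm (2-norm) = largest singular value of the difference
+    max(svd(proj(A) - proj(B))$d)
+}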
diff --git a/sim/sim_1b_normal_2.R b/sim/sim_1b_normal_2.R
new file mode 100644
index 0000000..1e15c15
--- /dev/null
+++ b/sim/sim_1b_normal_2.R
@@ -0,0 +1,185 @@
+library(tensorPredictors)
+# library(RGCCA)
+### Load modified version which _does not_ create a cluster in case of
+### `n_cores == 1`, allowing huge speed improvements (at least on Ubuntu 22.04 LTS).
+### Moreover, it is compatible with `Rscript`
+### Also added `Encoding: UTF-8` in `DESCRIPTION`
+devtools::load_all("~/Work/tensorPredictors/References/Software/TGCCA-modified", export_all = FALSE)
+
+
+setwd("~/Work/tensorPredictors/sim/")
+base.name <- format(Sys.time(), "sim_1b_normal_2-%Y%m%dT%H%M")
+
+# Source utility functions used in most simulations (extracted for convenience)
+source("./sim_utils.R")
+
+# Set PRNG seed for reproducibility
+# Note: `0x` is the HEX number prefix and the trailing `L` stands for "long",
+# which is `R`'s way of indicating an integer.
+set.seed(0x1bL, "Mersenne-Twister", "Inversion", "Rejection")
+
+### Simulation configuration
+reps <- 100                                 # number of simulation replications
+sample.sizes <- c(100, 200, 300, 500, 750)  # sample sizes `n`
+validation.sizes <- 10000
+dimX <- c(2, 3, 5)                          # predictor `X` dimension
+dimF <- rep(2, length(dimX))                # dimension of the "function" `F(y)` of the response `y`
+
+# setup true model parameters
+betas <- Map(diag, 1, dimX, dimF)
+Omegas <- Map(function(pj) 0.5^abs(outer(1:pj, 1:pj, `-`)), dimX) # AR(0.5)
+eta1 <- 0
+
+# data sampling routine
+sample.data <- function(sample.size, eta1, betas, Omegas) {
+    # response is a standard normal variable
+    y <- rnorm(sample.size)
+    # F(y) is a tensor of monomials
+    F <- sapply(y, function(yi) Reduce(outer, Map(`^`, yi, Map(seq, 0, len = dimF))))
+    dim(F) <- c(dimF, sample.size)
+
+    # sample predictors from the tensor normal X | Y = y (last axis is the sample axis)
+    sample.axis <- length(betas) + 1L
+    Deltas <- Map(solve, Omegas)                                    # normal covariances
+    mu_y <- mlm(mlm(F, betas) + as.vector(eta1), Deltas)            # conditional mean
+    X <- mu_y + rtensornorm(sample.size, 0, Deltas, sample.axis)    # predictors X
+
+    list(X = X, F = F, y = y, sample.axis = sample.axis)
+}
+
+# Create a CSV logger to write simulation results to
+log.file <- paste(base.name, "csv", sep = ".")
+logger <- CSV.logger(
+    file.name = log.file,
+    header = c("sample.size", "rep", outer(
+        c("dist.subspace", "dist.projection"),  # measures
+        c("gmlm", "tsir", "hopca"),             # methods
+        paste, sep = "."
+    ), outer(
+        c("time", "dist.min.subspace", "dist.min.projection", "reconst.error"),     # measures
+        c("gmlm", "pca", "hopca", "tsir", "mgcca"),                                 # methods
+        paste, sep = "."
+    ))
+)
+
+# compute true (full) model parameters to compare estimates against
+B.true <- Reduce(`%x%`, rev(betas))
+# map the full reduction to its minimal form by collapsing duplicate monomial
+# columns (see the note after this file's plotting code for why these column
+# indices correspond to the monomials 1, y, y^2, y^3)
+minimal <- function(B) { cbind(
+    "1"   = B[, 1],
+    "y"   = rowSums(B[, c(2, 3, 5)]),
+    "y^2" = rowSums(B[, c(4, 6, 7)]),
+    "y^3" = B[, 8]
+) }
+B.min.true <- minimal(B.true)
+
+### for each sample size
+for (sample.size in sample.sizes) {
+    # repeat every simulation
+    for (rep in seq_len(reps)) {
+        # Sample training data
+        c(X, F, y, sample.axis) %<-% sample.data(sample.size, eta1, betas, Omegas)
+
+        # Fit different models.
+        # Wrapped in a try-catch block to ensure the simulation continues;
+        # if an error occurs, print it and continue with the next replication.
+        tryCatch({
+            time.gmlm <- system.time(
+                fit.gmlm <- gmlm_tensor_normal(X, F, sample.axis = sample.axis)
+            )["user.self"]
+            time.pca <- system.time(
+                fit.pca <- prcomp(mat(X, sample.axis), rank. = 4)
+            )["user.self"]
+            time.hopca <- system.time(
+                fit.hopca <- HOPCA(X, npc = dimF, sample.axis = sample.axis)
+            )["user.self"]
+            time.tsir <- system.time(
+                fit.tsir <- TSIR(X, y, dimF, sample.axis = sample.axis)
+            )["user.self"]
+            # `mgcca` expects the first axis to be the sample axis
+            X.perm <- aperm(X, c(sample.axis, seq_along(dim(X))[-sample.axis]))
+            F.min <- mat(F, sample.axis)[, c(2, 4, 8)]  # minimal F: columns y, y^2, y^3
+            time.mgcca <- system.time(
+                fit.mgcca <- mgcca(
+                    list(X.perm, F.min),
+                    quiet = TRUE,
+                    scheme = "factorial",
+                    ncomp = c(4, 1)
+                )
+            )["user.self"]
+        }, error = function(ex) {
+            print(ex)
+        })
+
+        # Compute estimated (vectorized) reduction matrices
+        B.gmlm <- with(fit.gmlm, Reduce(`%x%`, rev(betas)))
+        B.hopca <- Reduce(`%x%`, rev(fit.hopca))
+        B.tsir <- Reduce(`%x%`, rev(fit.tsir))
+
+        # and the corresponding minimal reductions if not already minimal
+        B.min.gmlm <- minimal(B.gmlm)
+        B.min.pca <- fit.pca$rotation
+        B.min.hopca <- B.hopca[, 1:4]
+        B.min.tsir <- La.svd(B.tsir, 4L, 0L)$u
+        B.min.mgcca <- fit.mgcca$astar[[1]]
+
+
+        # Subspace Distances: Normalized `|| P_A - P_B ||_F` where
+        # `P_A = A (A' A)^-1 A'` and the normalization means that with
+        # respect to the dimensions of `A, B` the subspace distance is in the
+        # range `[0, 1]`.
+        dist.subspace.gmlm <- dist.subspace(B.true, B.gmlm, normalize = TRUE)
+        dist.subspace.hopca <- dist.subspace(B.true, B.hopca, normalize = TRUE)
+        dist.subspace.tsir <- dist.subspace(B.true, B.tsir, normalize = TRUE)
+
+        dist.min.subspace.gmlm <- dist.subspace(B.min.true, B.min.gmlm, normalize = TRUE)
+        dist.min.subspace.pca <- dist.subspace(B.min.true, B.min.pca, normalize = TRUE)
+        dist.min.subspace.hopca <- dist.subspace(B.min.true, B.min.hopca, normalize = TRUE)
+        dist.min.subspace.tsir <- dist.subspace(B.min.true, B.min.tsir, normalize = TRUE)
+        dist.min.subspace.mgcca <- dist.subspace(B.min.true, B.min.mgcca, normalize = TRUE)
+
+        # Projection Distances: Spectral norm (2-norm) `|| P_A - P_B ||_2`.
+        dist.projection.gmlm <- dist.projection(B.true, B.gmlm)
+        dist.projection.hopca <- dist.projection(B.true, B.hopca)
+        dist.projection.tsir <- dist.projection(B.true, B.tsir)
+
+        dist.min.projection.gmlm <- dist.projection(B.min.true, B.min.gmlm)
+        dist.min.projection.pca <- dist.projection(B.min.true, B.min.pca)
+        dist.min.projection.hopca <- dist.projection(B.min.true, B.min.hopca)
+        dist.min.projection.tsir <- dist.projection(B.min.true, B.min.tsir)
+        dist.min.projection.mgcca <- dist.projection(B.min.true, B.min.mgcca)
+
+        # # Reconstruction error (MSE) of y given X with a new sample
+        # c(X, F, y, sample.axis) %<-% sample.data(validation.sizes, eta1, betas, Omegas)
+        # y.gmlm <- rowMeans(mat(mlm(X, fit.gmlm$betas), sample.axis)[, c(2, 3, 5)])
+
+
+        # Call CSV logger writing results to file
+        logger()
+
+        # print progress
+        cat(sprintf("sample size (%d): %d/%d - rep: %d/%d\n",
+            sample.size, which(sample.size == sample.sizes),
+            length(sample.sizes), rep, reps))
+    }
+}
+
+
+### read simulation results and generate plots
+if (!interactive()) { pdf(file = paste(base.name, "pdf", sep = ".")) }
+
+sim <- read.csv(log.file)
+
+plot.sim(sim, "dist.subspace", main = "Full Subspace Distance",
+    xlab = "Sample Size", ylab = "Distance")
+
+plot.sim(sim, "dist.min.subspace", main = "Min Subspace Distance",
+    xlab = "Sample Size", ylab = "Distance")
+
+plot.sim(sim, "dist.projection", main = "Full Projection Distance",
+    xlab = "Sample Size", ylab = "Distance")
+
+plot.sim(sim, "dist.min.projection", main = "Min Projection Distance",
+    xlab = "Sample Size", ylab = "Distance")
+
+plot.sim(sim, "time", main = "Runtime",
+    xlab = "Sample Size", ylab = "Time")
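+# Note (an added illustration, not part of the original script): the column
+# grouping in `minimal` follows from vectorizing the outer product of the
+# monomial factors (1, y) over the three axes. The exponent pattern
+#     Reduce(function(a, b) outer(a, b, `+`), Map(seq, 0L, len = c(2, 2, 2)))
+# evaluates to c(0, 1, 1, 2, 1, 2, 2, 3), i.e. columns 2, 3, 5 of the full
+# reduction all carry `y`, columns 4, 6, 7 carry `y^2`, and column 8 is `y^3`.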
diff --git a/sim/sim_1c_normal.R b/sim/sim_1c_normal.R
new file mode 100644
index 0000000..f72c2b8
--- /dev/null
+++ b/sim/sim_1c_normal.R
@@ -0,0 +1,152 @@
+library(tensorPredictors)
+# library(RGCCA)
+### Load modified version which _does not_ create a cluster in case of
+### `n_cores == 1`, allowing huge speed improvements (at least on Ubuntu 22.04 LTS).
+### Moreover, it is compatible with `Rscript`
+### Also added `Encoding: UTF-8` in `DESCRIPTION`
+devtools::load_all("~/Work/tensorPredictors/References/Software/TGCCA-modified", export_all = FALSE)
+
+
+setwd("~/Work/tensorPredictors/sim/")
+base.name <- format(Sys.time(), "sim_1c_normal-%Y%m%dT%H%M")
+
+# Source utility functions used in most simulations (extracted for convenience)
+source("./sim_utils.R")
+
+# Set PRNG seed for reproducibility
+# Note: `0x` is the HEX number prefix and the trailing `L` stands for "long",
+# which is `R`'s way of indicating an integer.
+set.seed(0x1cL, "Mersenne-Twister", "Inversion", "Rejection")
+
+### Simulation configuration
+reps <- 100                                 # number of simulation replications
+sample.sizes <- c(100, 200, 300, 500, 750)  # sample sizes `n`
+dimX <- c(2, 3, 5)                          # predictor `X` dimension
+dimF <- rep(2, length(dimX))                # dimension of the "function" `F(y)` of the response `y`
+
+# setup true model parameters (rank 1 betas)
+betas <- Map(function(nr, nc) {
+    tcrossprod((-1)^seq_len(nr), (-1)^seq_len(nc))
+}, dimX, dimF)
+Omegas <- Map(function(pj) 0.5^abs(outer(1:pj, 1:pj, `-`)), dimX) # AR(0.5)
+eta1 <- 0
+
+# define projections onto rank 1 matrices for betas
+# (a sketch of such a rank-1 projection follows this file's plotting code)
+proj.betas <- Map(.projRank, rep(1L, length(betas)))
+
+# data sampling routine
+sample.data <- function(sample.size, eta1, betas, Omegas) {
+    # response is a standard normal variable
+    y <- rnorm(sample.size)
+    # F(y) is a tensor of monomials
+    y.pow <- Reduce(function(a, b) outer(a, b, `+`), Map(seq, 0L, len = dimF))
+    F <- t(outer(y, as.vector(y.pow), `^`))
+    dim(F) <- c(dimF, sample.size)
+
+    # sample predictors from the tensor normal X | Y = y (last axis is the sample axis)
+    sample.axis <- length(betas) + 1L
+    Deltas <- Map(solve, Omegas)                                    # normal covariances
+    mu_y <- mlm(mlm(F, betas) + as.vector(eta1), Deltas)            # conditional mean
+    X <- mu_y + rtensornorm(sample.size, 0, Deltas, sample.axis)    # predictors X
+
+    list(X = X, F = F, y = y, sample.axis = sample.axis)
+}
+
+# Create a CSV logger to write simulation results to
+log.file <- paste(base.name, "csv", sep = ".")
+logger <- CSV.logger(
+    file.name = log.file,
+    header = c("sample.size", "rep", outer(
+        c("time", "dist.subspace"),                 # measures
+        c("gmlm", "pca", "hopca", "tsir", "mgcca"), # methods
+        paste, sep = "."
+    ))
+)
+
+# compute true (full) model parameters to compare estimates against
+B.true <- Reduce(`%x%`, rev(betas))[, 1L, drop = FALSE]
+
+### for each sample size
+for (sample.size in sample.sizes) {
+    # repeat every simulation
+    for (rep in seq_len(reps)) {
+        # Sample training data
+        c(X, F, y, sample.axis) %<-% sample.data(sample.size, eta1, betas, Omegas)
+
+        # Fit different models.
+        # Wrapped in a try-catch block to ensure the simulation continues;
+        # if an error occurs, print it and continue with the next replication.
+        tryCatch({
+            time.gmlm <- system.time(
+                fit.gmlm <- gmlm_tensor_normal(X, F, sample.axis = sample.axis,
+                    proj.betas = proj.betas)
+            )["user.self"]
+            time.pca <- system.time(
+                fit.pca <- prcomp(mat(X, sample.axis), rank. = 1L)
+            )["user.self"]
+            time.hopca <- system.time(
+                fit.hopca <- HOPCA(X, npc = c(1L, 1L, 1L), sample.axis = sample.axis)
+            )["user.self"]
+            time.tsir <- system.time(
+                fit.tsir <- TSIR(X, y, d = c(1L, 1L, 1L), sample.axis = sample.axis)
+            )["user.self"]
+            # `mgcca` expects the first axis to be the sample axis
+            X1 <- aperm(X, c(sample.axis, seq_along(dim(X))[-sample.axis]))
+            F1 <- cbind(y, y^2, y^3)
+            time.mgcca <- system.time(
+                fit.mgcca <- mgcca(
+                    list(X1, F1),
+                    quiet = TRUE,
+                    scheme = "factorial",
+                    ncomp = c(1L, 1L)
+                )
+            )["user.self"]
+        }, error = function(ex) {
+            print(ex)
+        })
+
+        # Compute estimated (vectorized) reduction matrices
+        B.gmlm <- Reduce(kronecker, Map(
+            function(beta) qr.Q(qr(beta))[, 1L, drop = FALSE],
+            rev(fit.gmlm$betas)))
+        B.pca <- fit.pca$rotation
+        B.hopca <- Reduce(`%x%`, rev(fit.hopca))
+        B.tsir <- Reduce(`%x%`, rev(fit.tsir))
+        B.mgcca <- fit.mgcca$astar[[1]]
+
+        # Subspace Distances: Normalized `|| P_A - P_B ||_F` where
+        # `P_A = A (A' A)^-1 A'` and the normalization means that with
+        # respect to the dimensions of `A, B` the subspace distance is in the
+        # range `[0, 1]`.
+        dist.subspace.gmlm <- dist.subspace(B.true, B.gmlm, normalize = TRUE)
+        dist.subspace.pca <- dist.subspace(B.true, B.pca, normalize = TRUE)
+        dist.subspace.hopca <- dist.subspace(B.true, B.hopca, normalize = TRUE)
+        dist.subspace.tsir <- dist.subspace(B.true, B.tsir, normalize = TRUE)
+        dist.subspace.mgcca <- dist.subspace(B.true, B.mgcca, normalize = TRUE)
+
+        # No projection distance, as in this case the subspace and projection
+        # distances are identical
+
+        # Call CSV logger writing results to file
+        logger()
+
+        # print progress
+        cat(sprintf("sample size (%d): %d/%d - rep: %d/%d\n",
+            sample.size, which(sample.size == sample.sizes),
+            length(sample.sizes), rep, reps))
+    }
+}
+
+
+### read simulation results and generate plots
+if (!interactive()) { pdf(file = paste(base.name, "pdf", sep = ".")) }
+
+sim <- read.csv(log.file)
+
+plot.sim(sim, "dist.subspace", main = "Subspace Distance",
+    xlab = "Sample Size", ylab = "Distance")
+
+plot.sim(sim, "time", main = "Runtime",
+    xlab = "Sample Size", ylab = "Time [s]", ylim = c(0, 18))
+
+# aggr <- aggregate(sim[, names(sim) != "sample.size"], list(sample.size = sim$sample.size), mean)
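+# A sketch of a rank-1 projection as `.projRank` is assumed to perform it (an
+# added illustration, not part of the original script): the closest rank-1
+# matrix in Frobenius norm via a truncated SVD; the package internal
+# `.projRank` is authoritative.
+proj.rank1.sketch <- function(B) {
+    with(La.svd(B, 1L, 1L), d[1] * u %*% vt)
+}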
diff --git a/sim/sim_1d_normal.R b/sim/sim_1d_normal.R
new file mode 100644
index 0000000..0e3ad4a
--- /dev/null
+++ b/sim/sim_1d_normal.R
@@ -0,0 +1,157 @@
+library(tensorPredictors)
+# library(RGCCA)
+### Load modified version which _does not_ create a cluster in case of
+### `n_cores == 1`, allowing huge speed improvements (at least on Ubuntu 22.04 LTS).
+### Moreover, it is compatible with `Rscript`
+### Also added `Encoding: UTF-8` in `DESCRIPTION`
+devtools::load_all("~/Work/tensorPredictors/References/Software/TGCCA-modified", export_all = FALSE)
+
+
+setwd("~/Work/tensorPredictors/sim/")
+base.name <- format(Sys.time(), "sim_1d_normal-%Y%m%dT%H%M")
+
+# Source utility functions used in most simulations (extracted for convenience)
+source("./sim_utils.R")
+
+# Set PRNG seed for reproducibility
+# Note: `0x` is the HEX number prefix and the trailing `L` stands for "long",
+# which is `R`'s way of indicating an integer.
+set.seed(0x1dL, "Mersenne-Twister", "Inversion", "Rejection")
+
+### Simulation configuration
+reps <- 100                                 # number of simulation replications
+sample.sizes <- c(100, 200, 300, 500, 750)  # sample sizes `n`
+dimX <- c(2, 3, 5)                          # predictor `X` dimension
+dimF <- rep(2, length(dimX))                # dimension of the "function" `F(y)` of the response `y`
+
+# setup true model parameters
+betas <- Map(diag, 1, dimX, dimF)
+Omegas <- Map(function(pj) 0.5^abs(outer(1:pj, 1:pj, `-`)), dimX) # AR(0.5)
+eta1 <- 0
+
+# define projections onto tri-diagonal matrices
+# (a sketch of such a band projection follows this file's plotting code)
+proj.Omegas <- Map(.projBand, Map(dim, Omegas), 1L, 1L)
+# and project Omegas
+Omegas <- Map(do.call, proj.Omegas, Map(list, Omegas))
+
+# data sampling routine
+sample.data <- function(sample.size, eta1, betas, Omegas) {
+    # response is a standard normal variable
+    y <- rnorm(sample.size)
+    # F(y) is a tensor of monomials
+    F <- sapply(y, function(yi) Reduce(outer, Map(`^`, yi, Map(seq, 0, len = dimF))))
+    dim(F) <- c(dimF, sample.size)
+
+    # sample predictors from the tensor normal X | Y = y (last axis is the sample axis)
+    sample.axis <- length(betas) + 1L
+    Deltas <- Map(solve, Omegas)                                    # normal covariances
+    mu_y <- mlm(mlm(F, betas) + as.vector(eta1), Deltas)            # conditional mean
+    X <- mu_y + rtensornorm(sample.size, 0, Deltas, sample.axis)    # predictors X
+
+    list(X = X, F = F, y = y, sample.axis = sample.axis)
+}
+
+# Create a CSV logger to write simulation results to
+log.file <- paste(base.name, "csv", sep = ".")
+logger <- CSV.logger(
+    file.name = log.file,
+    header = c("sample.size", "rep", outer(
+        c("time", "dist.subspace", "dist.projection"),  # measures
+        c("gmlm", "pca", "hopca", "tsir", "mgcca"),     # methods
+        paste, sep = "."
+    ))
+)
+
+# compute true (full) model parameters to compare estimates against
+B.true <- Reduce(`%x%`, rev(betas))
+
+### for each sample size
+for (sample.size in sample.sizes) {
+    # repeat every simulation
+    for (rep in seq_len(reps)) {
+        # Sample training data
+        c(X, F, y, sample.axis) %<-% sample.data(sample.size, eta1, betas, Omegas)
+
+        # Fit different models.
+        # Wrapped in a try-catch block to ensure the simulation continues;
+        # if an error occurs, print it and skip to the next replication (see below).
+        try.catch.block <- tryCatch({
+            time.gmlm <- system.time(
+                fit.gmlm <- gmlm_tensor_normal(X, F, sample.axis = sample.axis,
+                    proj.Omegas = proj.Omegas)
+            )["user.self"]
+            time.pca <- system.time(
+                fit.pca <- prcomp(mat(X, sample.axis), rank. = prod(dimF))
+            )["user.self"]
+            time.hopca <- system.time(
+                fit.hopca <- HOPCA(X, npc = dimF, sample.axis = sample.axis)
+            )["user.self"]
+            time.tsir <- system.time(
+                fit.tsir <- TSIR(X, y, dimF, sample.axis = sample.axis)
+            )["user.self"]
+            # `mgcca` expects the first axis to be the sample axis
+            X1 <- aperm(X, c(sample.axis, seq_along(dim(X))[-sample.axis]))
+            F1 <- cbind(y, y^2, y^3)
+            time.mgcca <- system.time(
+                fit.mgcca <- mgcca(
+                    list(X1, F1),
+                    quiet = TRUE,
+                    scheme = "factorial",
+                    ncomp = c(prod(dimF), 1L)
+                )
+            )["user.self"]
+        }, error = print)
+
+        # Drop comparison in case of any error (in any fitting routine)
+        if (inherits(try.catch.block, "error")) { next }
+
+        # Compute estimated (vectorized) reduction matrices
+        B.gmlm <- with(fit.gmlm, Reduce(`%x%`, rev(betas)))
+        B.pca <- fit.pca$rotation
+        B.hopca <- Reduce(`%x%`, rev(fit.hopca))
+        B.tsir <- Reduce(`%x%`, rev(fit.tsir))
+        B.mgcca <- fit.mgcca$astar[[1]]
+
+        # Subspace Distances: Normalized `|| P_A - P_B ||_F` where
+        # `P_A = A (A' A)^-1 A'` and the normalization means that with
+        # respect to the dimensions of `A, B` the subspace distance is in the
+        # range `[0, 1]`.
+        dist.subspace.gmlm <- dist.subspace(B.true, B.gmlm, normalize = TRUE)
+        dist.subspace.pca <- dist.subspace(B.true, B.pca, normalize = TRUE)
+        dist.subspace.hopca <- dist.subspace(B.true, B.hopca, normalize = TRUE)
+        dist.subspace.tsir <- dist.subspace(B.true, B.tsir, normalize = TRUE)
+        dist.subspace.mgcca <- dist.subspace(B.true, B.mgcca, normalize = TRUE)
+
+        # Projection Distances: Spectral norm (2-norm) `|| P_A - P_B ||_2`.
+        dist.projection.gmlm <- dist.projection(B.true, B.gmlm)
+        dist.projection.pca <- dist.projection(B.true, B.pca)
+        dist.projection.hopca <- dist.projection(B.true, B.hopca)
+        dist.projection.tsir <- dist.projection(B.true, B.tsir)
+        dist.projection.mgcca <- dist.projection(B.true, B.mgcca)
+
+        # Call CSV logger writing results to file
+        logger()
+
+        # print progress
+        cat(sprintf("sample size (%d): %d/%d - rep: %d/%d\n",
+            sample.size, which(sample.size == sample.sizes),
+            length(sample.sizes), rep, reps))
+    }
+}
+
+
+
+
+### read simulation results and generate plots
+if (!interactive()) { pdf(file = paste(base.name, "pdf", sep = ".")) }
+
+sim <- read.csv(log.file)
+
+plot.sim(sim, "dist.subspace", main = "Subspace Distance",
+    xlab = "Sample Size", ylab = "Distance")
+
+plot.sim(sim, "dist.projection", main = "Projection Distance",
+    xlab = "Sample Size", ylab = "Distance")
+
+plot.sim(sim, "time", main = "Runtime",
+    xlab = "Sample Size", ylab = "Time [s]", ylim = c(0, 18))
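+# A sketch of a tri-diagonal (bandwidth 1) projection as `.projBand` is
+# assumed to perform it (an added illustration, not part of the original
+# script): zero all entries more than one position off the main diagonal;
+# the package internal `.projBand` is authoritative.
+proj.band.sketch <- function(A, bandwidth = 1L) {
+    A * (abs(row(A) - col(A)) <= bandwidth)
+}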
diff --git a/sim/sim_1e_normal.R b/sim/sim_1e_normal.R
new file mode 100644
index 0000000..6eb2544
--- /dev/null
+++ b/sim/sim_1e_normal.R
@@ -0,0 +1,231 @@
+library(tensorPredictors)
+# library(RGCCA)    # for `mgcca`
+### Load modified version which _does not_ create a cluster in case of
+### `n_cores == 1`, allowing huge speed improvements (at least on Ubuntu 22.04 LTS).
+### Moreover, it is compatible with `Rscript`
+devtools::load_all("~/Work/tensorPredictors/References/Software/TGCCA-modified", export_all = FALSE)
+
+
+setwd("~/Work/tensorPredictors/sim/")
+base.name <- format(Sys.time(), "sim_1e_normal-%Y%m%dT%H%M")
+
+# Source utility functions used in most simulations (extracted for convenience)
+source("./sim_utils.R")
+
+# Set PRNG seed for reproducibility
+# Note: `0x` is the HEX number prefix and the trailing `L` stands for "long",
+# which is `R`'s way of indicating an integer.
+set.seed(0x1eL, "Mersenne-Twister", "Inversion", "Rejection")
+
+### Simulation configuration
+reps <- 100                                 # number of simulation replications
+sample.sizes <- c(100, 200, 300, 500, 750)  # sample sizes `n`
+dimX <- c(5, 5)                             # predictor `X` dimension
+dimF <- c(2, 2)
+
+
+Sigma <- 0.5^abs(outer(1:prod(dimX), 1:prod(dimX), `-`))    # AR(0.5)
+
+# # define projections onto tri-diagonal matrices
+# proj.betas <- Map(.projRank, rep(1L, length(dimX)))           # wrong assumption of low rank betas
+# proj.Omegas <- Map(.projBand, Map(c, dimX, dimX), 1L, 1L)     # wrong assumption of banded scatter matrices
+
+# data sampling routine
+sample.data <- function(sample.size, dimX, dimF, B.true, Sigma) {
+    y <- rnorm(sample.size)
+
+    # the true F (in vectorized form)
+    vecF <- rbind(1, sin(y), cos(y), sin(y) * cos(y))
+
+    # sample X as a multivariate normal (in vectorized form)
+    vecX <- B.true %*% vecF + t(chol(Sigma)) %*% matrix(rnorm(prod(sample.size, dimX)), prod(dimX))
+    X <- array(vecX, c(dimX, sample.size))
+
+    list(X, vecF, y, length(dim(X)))
+}
+
+# wrong (misspecified) assumption about the function `F(y)`
+F.wrong <- function(y) array(rbind(1, y, y, y^2), c(2, 2, length(y)))
+
+
+# Create a CSV logger to write simulation results to
+log.file <- paste(base.name, "csv", sep = ".")
+logger <- CSV.logger(
+    file.name = log.file,
+    header = c("sample.size", "rep", outer(
+        c("time", "dist.subspace", "dist.projection"),  # measures
+        c("gmlm", "pca", "hopca", "tsir", "mgcca"),     # methods
+        paste, sep = "."
+    ))
+)
+
+# Mis-specify the true beta as _not_ a Kronecker product
+B.true <- diag(1, prod(dimX), prod(dimF))
+
+### for each sample size
+for (sample.size in sample.sizes) {
+    # repeat every simulation
+    for (rep in seq_len(reps)) {
+        # Sample training data
+        c(X, F.true, y, sample.axis) %<-% sample.data(sample.size, dimX, dimF, B.true, Sigma)
+
+        # Fit different models.
+        # Wrapped in a try-catch block to ensure the simulation continues;
+        # if an error occurs, print it and continue with the next replication.
+        tryCatch({
+            time.gmlm <- system.time(
+                fit.gmlm <- gmlm_tensor_normal(X, F.wrong(y), sample.axis = sample.axis)
+            )["user.self"]
+            time.pca <- system.time(
+                fit.pca <- prcomp(mat(X, sample.axis), rank. = prod(dimF))
+            )["user.self"]
+            time.hopca <- system.time(
+                fit.hopca <- HOPCA(X, npc = dimF, sample.axis = sample.axis)
+            )["user.self"]
+            time.tsir <- system.time(
+                fit.tsir <- TSIR(X, y, dimF, sample.axis = sample.axis)
+            )["user.self"]
+            # `mgcca` expects the first axis to be the sample axis
+            X1 <- aperm(X, c(sample.axis, seq_along(dim(X))[-sample.axis]))
+            time.mgcca <- system.time(
+                fit.mgcca <- mgcca(
+                    list(X1, y),
+                    quiet = TRUE,
+                    scheme = "factorial",
+                    ncomp = c(prod(dimF), 1),
+                    ranks = c(prod(dimF), 1)
+                )
+            )["user.self"]
+        }, error = function(ex) {
+            print(ex)
+        })
+
+        # Compute estimated (vectorized) reduction matrices
+        B.gmlm <- with(fit.gmlm, Reduce(`%x%`, rev(betas)))
+        B.pca <- fit.pca$rotation
+        B.hopca <- Reduce(`%x%`, rev(fit.hopca))
+        B.tsir <- Reduce(`%x%`, rev(fit.tsir))
+        B.mgcca <- fit.mgcca$astar[[1]]
+
+        # Subspace Distances: Normalized `|| P_A - P_B ||_F` where
+        # `P_A = A (A' A)^-1 A'` and the normalization means that with
+        # respect to the dimensions of `A, B` the subspace distance is in the
+        # range `[0, 1]`.
+        dist.subspace.gmlm <- dist.subspace(B.true, B.gmlm, normalize = TRUE)
+        dist.subspace.pca <- dist.subspace(B.true, B.pca, normalize = TRUE)
+        dist.subspace.hopca <- dist.subspace(B.true, B.hopca, normalize = TRUE)
+        dist.subspace.tsir <- dist.subspace(B.true, B.tsir, normalize = TRUE)
+        dist.subspace.mgcca <- dist.subspace(B.true, B.mgcca, normalize = TRUE)
+
+        # Projection Distances: Spectral norm (2-norm) `|| P_A - P_B ||_2`.
+        dist.projection.gmlm <- dist.projection(B.true, B.gmlm)
+        dist.projection.pca <- dist.projection(B.true, B.pca)
+        dist.projection.hopca <- dist.projection(B.true, B.hopca)
+        dist.projection.tsir <- dist.projection(B.true, B.tsir)
+        dist.projection.mgcca <- dist.projection(B.true, B.mgcca)
+
+        # Call CSV logger writing results to file
+        logger()
+
+        # print progress
+        cat(sprintf("sample size (%d): %d/%d - rep: %d/%d\n",
+            sample.size, which(sample.size == sample.sizes),
+            length(sample.sizes), rep, reps))
+    }
+}
+
+
+### read simulation results and generate plots
+if (!interactive()) { pdf(file = paste(base.name, "pdf", sep = ".")) }
+
+sim <- read.csv(log.file)
+
+# Rename the sample-size grouping variable to avoid name conflicts
+aggr.mean <- aggregate(sim, list(sampleSize = sim$sample.size), mean)
+aggr.median <- aggregate(sim, list(sampleSize = sim$sample.size), median)
+aggr.sd <- aggregate(sim, list(sampleSize = sim$sample.size), sd)
+aggr.min <- aggregate(sim, list(sampleSize = sim$sample.size), min)
+aggr.max <- aggregate(sim, list(sampleSize = sim$sample.size), max)
+
+par(pch = 16, bty = "n", lty = "solid", lwd = 1.5)
+cols <- c(gmlm = "blue", pca = "darkcyan", hopca = "red", tsir = "darkgreen",
+    mgcca = "purple")
+
+with(aggr.mean, {
+    plot(range(sampleSize), c(0, 1), type = "n",
+        main = "Subspace Distance",
+        xlab = "Sample Size",
+        ylab = "Distance"
+    )
+    for (dist.name in ls(pattern = "^dist.subspace")) {
+        mean <- get(dist.name)
+        median <- aggr.median[aggr.sd$sampleSize == sampleSize, dist.name]
+        sd <- aggr.sd[aggr.sd$sampleSize == sampleSize, dist.name]
+        min <- aggr.min[aggr.sd$sampleSize == sampleSize, dist.name]
+        max <- aggr.max[aggr.sd$sampleSize == sampleSize, dist.name]
+        method <- tail(strsplit(dist.name, ".", fixed = TRUE)[[1]], 1)
+        col <- cols[method]
+        lines(sampleSize, mean, type = "o", col = col, lty = 1, lwd = 1)
+        lines(sampleSize, mean + sd, col = col, lty = 2, lwd = 0.8)
+        lines(sampleSize, mean - sd, col = col, lty = 2, lwd = 0.8)
+        lines(sampleSize, median, col = col, lty = 1, lwd = 1)
+        lines(sampleSize, min, col = col, lty = 3, lwd = 0.6)
+        lines(sampleSize, max, col = col, lty = 3, lwd = 0.6)
+    }
+
+    legend("topright", col = cols, lty = 1, legend = names(cols),
+        bty = "n", lwd = par("lwd"), pch = par("pch"))
+})
+
+with(aggr.mean, {
+    plot(range(sampleSize), c(0, 1), type = "n",
+        main = "Projection Distance",
+        xlab = "Sample Size",
+        ylab = "Distance"
+    )
+    for (dist.name in ls(pattern = "^dist.projection")) {
+        mean <- get(dist.name)
+        median <- aggr.median[aggr.sd$sampleSize == sampleSize, dist.name]
+        sd <- aggr.sd[aggr.sd$sampleSize == sampleSize, dist.name]
+        min <- aggr.min[aggr.sd$sampleSize == sampleSize, dist.name]
+        max <- aggr.max[aggr.sd$sampleSize == sampleSize, dist.name]
+        method <- tail(strsplit(dist.name, ".", fixed = TRUE)[[1]], 1)
+        col <- cols[method]
+        lines(sampleSize, mean, type = "o", col = col, lty = 1, lwd = 1)
+        lines(sampleSize, mean + sd, col = col, lty = 2, lwd = 0.8)
+        lines(sampleSize, mean - sd, col = col, lty = 2, lwd = 0.8)
+        lines(sampleSize, median, col = col, lty = 1, lwd = 1)
+        lines(sampleSize, min, col = col, lty = 3, lwd = 0.6)
+        lines(sampleSize, max, col = col, lty = 3, lwd = 0.6)
+    }
+
+    legend("topright", col = cols, lty = 1, legend = names(cols),
+        bty = "n", lwd = par("lwd"), pch = par("pch"))
+})
+
+
+
+# sample.axis <- 1L
+# F.wrong <- array(outer(y, c(0, 1, 1, 2, 1, 2, 2, 3), `^`), c(10, 2, 2, 2))
+
+
+# F.true <- array(c(1, sin(y), cos(y)))
+
+
+# 1  s  c
+# s  ss cs
+# c  sc cc
+
+# s  ss  cs
+# ss sss css
+# cs scs ccs
+
+# c  sc  cc
+# sc ssc csc
+# cc scc ccc
+
+# osc <-
+
+# I <- array(0, c(3, 3, 3))
+
+# slice.index()
diff --git a/sim/sim_2a_ising.R b/sim/sim_2a_ising.R
new file mode 100644
index 0000000..6abcba7
--- /dev/null
+++ b/sim/sim_2a_ising.R
@@ -0,0 +1,315 @@
+# library(tensorPredictors)
+devtools::load_all("~/Work/tensorPredictors/tensorPredictors", export_all = FALSE)
+
+library(logisticPCA)
+# library(RGCCA)
+# Use modified version of `RGCCA`
+# Reasons (on Ubuntu 22.04 LTS):
+#   - compatible with `Rscript`
+#   - about 4 times faster for small problems
+# Changes:
+#   - Run in main thread, avoid `parallel::makeCluster` if `n_cores == 1`
+#     (file "R/mgccak.R" lines 81:103)
+#   - added `Encoding: UTF-8`
+#     (file "DESCRIPTION")
+suppressWarnings({
+    devtools::load_all("~/Work/tensorPredictors/References/Software/TGCCA-modified", export_all = FALSE)
+})
+
+setwd("~/Work/tensorPredictors/sim/")
+base.name <- format(Sys.time(), "sim_2a_ising-%Y%m%dT%H%M")
+
+# Source utility functions used in most simulations (extracted for convenience)
+source("./sim_utils.R")
+
+# Set PRNG seed for reproducibility
+# Note: `0x` is the HEX number prefix and the trailing `L` stands for "long",
+# which is `R`'s way of indicating an integer.
+set.seed(0x2aL, "Mersenne-Twister", "Inversion", "Rejection")
+
+
+reps <- 100                                 # number of simulation replications
+sample.sizes <- c(100, 200, 300, 500, 750)  # sample sizes `n`
+dimX <- c(2, 3)                             # predictor `X` dimension
+dimF <- rep(1, length(dimX))                # dimension of the "function" `F(y)` of the response `y`
+
+betas <- Map(diag, 1, dimX, dimF)
+Omegas <- list(toeplitz(c(0, -2)), toeplitz(seq(1, 0, by = -0.5)))
+
+
+# data sampling routine (an illustration of the Ising natural parameters
+# built below follows after this function)
+sample.data <- function(sample.size, betas, Omegas) {
+    dimX <- mapply(nrow, betas)
+    dimF <- mapply(ncol, betas)
+
+    # generate response (sample axis is the last axis)
+    y <- runif(prod(sample.size, dimF), -2, 2)
+    F <- array(y, dim = c(dimF, sample.size))   # ~ U[-2, 2]
+
+    Omega <- Reduce(kronecker, rev(Omegas))
+
+    X <- apply(F, length(dim(F)), function(Fi) {
+        dim(Fi) <- dimF
+        params <- diag(as.vector(mlm(Fi, betas))) + Omega
+        tensorPredictors::ising_sample(1, params)
+    })
+    dim(X) <- c(dimX, sample.size)
+
+    list(X = X, F = F, y = y, sample.axis = length(dim(X)))
+}
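+# Note (an added illustration, not part of the original script): in the
+# natural parameter matrix `params` above, the diagonal
+# `diag(as.vector(mlm(Fi, betas)))` carries the component-wise main effects
+# driven by `F(y)`, while the off-diagonal entries of `Omega` are the pairwise
+# interaction parameters of the Ising model. For instance, for a single
+# observation with F(y) = 0.5:
+#     Fi <- array(0.5, dim = c(1, 1))
+#     params <- diag(as.vector(mlm(Fi, betas))) + Reduce(kronecker, rev(Omegas))
+# gives the 6 x 6 symmetric parameter matrix of the 2 x 3 binary predictor.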
+lpca.hyper.param <- local({
+    c(X, F, y, sample.axis) %<-% sample.data(1e3, betas, Omegas)
+    vecX <- mat(X, sample.axis)
+    CV <- cv.lpca(vecX, ks = prod(dimF), ms = seq(1, 20, by = 0.5))
+    # plot(CV)
+    as.numeric(colnames(CV))[which.min(CV)]
+})
+
+
+# Create a CSV logger to write simulation results to
+log.file <- paste(base.name, "csv", sep = ".")
+logger <- CSV.logger(
+    file.name = log.file,
+    header = c("sample.size", "rep", outer(
+        c("time", "dist.subspace"),                                             # measures
+        c("gmlm", "tnormal", "pca", "hopca", "lpca", "clpca", "tsir", "mgcca"), # methods
+        paste, sep = "."
+    ))
+)
+
+
+
+
+# compute true (full) model parameters to compare estimates against
+B.true <- Reduce(`%x%`, rev(betas))
+
+### for each sample size
+for (sample.size in sample.sizes) {
+    # repeat every simulation
+    for (rep in seq_len(reps)) {
+        # Sample training data
+        c(X, F, y, sample.axis) %<-% sample.data(sample.size, betas, Omegas)
+
+        # Fit different models.
+        # Wrapped in a try-catch block to ensure the simulation continues;
+        # if an error occurs, print it and skip to the next replication (see below).
+        try.catch.block <- tryCatch({
+            time.gmlm <- system.time(
+                fit.gmlm <- gmlm_ising(X, F, sample.axis = sample.axis)
+            )["user.self"]
+            time.tnormal <- -1  # part of the Ising gmlm (not relevant here)
+            time.pca <- system.time(
+                fit.pca <- prcomp(mat(X, sample.axis), rank. = prod(dimF))
+            )["user.self"]
+            time.hopca <- system.time(
+                fit.hopca <- HOPCA(X, npc = dimF, sample.axis = sample.axis)
+            )["user.self"]
+            time.lpca <- system.time(
+                fit.lpca <- logisticPCA(mat(X, sample.axis), k = prod(dimF), m = lpca.hyper.param)
+            )["user.self"]
+            time.clpca <- system.time(
+                fit.clpca <- convexLogisticPCA(mat(X, sample.axis), k = prod(dimF), m = lpca.hyper.param)
+            )["user.self"]
+            time.tsir <- system.time(
+                fit.tsir <- TSIR(X, y, dimF, sample.axis = sample.axis)
+            )["user.self"]
+            # `mgcca` expects the first axis to be the sample axis
+            X1 <- aperm(X, c(sample.axis, seq_along(dim(X))[-sample.axis]))
+            time.mgcca <- system.time(
+                fit.mgcca <- mgcca(
+                    list(X1, y),
+                    quiet = TRUE,
+                    scheme = "factorial",
+                    ncomp = c(1, 1)
+                )
+            )["user.self"]
+        }, error = print)
+
+        # Drop comparison in case of any error (in any fitting routine)
+        if (inherits(try.catch.block, "error")) { next }
+
+        # Compute estimated (vectorized) reduction matrices
+        B.gmlm <- with(fit.gmlm, Reduce(`%x%`, rev(betas)))
+        B.tnormal <- with(attr(fit.gmlm, "tensor_normal"), Reduce(`%x%`, rev(betas)))
+        B.pca <- fit.pca$rotation
+        B.hopca <- Reduce(`%x%`, rev(fit.hopca))
+        B.lpca <- fit.lpca$U
+        B.clpca <- fit.clpca$U
+        B.tsir <- Reduce(`%x%`, rev(fit.tsir))
+        B.mgcca <- fit.mgcca$astar[[1]]
+
+        # Subspace Distances: Normalized `|| P_A - P_B ||_F` where
+        # `P_A = A (A' A)^-1 A'` and the normalization means that with
+        # respect to the dimensions of `A, B` the subspace distance is in the
+        # range `[0, 1]`.
+        dist.subspace.gmlm <- dist.subspace(B.true, B.gmlm, normalize = TRUE)
+        dist.subspace.tnormal <- dist.subspace(B.true, B.tnormal, normalize = TRUE)
+        dist.subspace.pca <- dist.subspace(B.true, B.pca, normalize = TRUE)
+        dist.subspace.hopca <- dist.subspace(B.true, B.hopca, normalize = TRUE)
+        dist.subspace.lpca <- dist.subspace(B.true, B.lpca, normalize = TRUE)
+        dist.subspace.clpca <- dist.subspace(B.true, B.clpca, normalize = TRUE)
+        dist.subspace.tsir <- dist.subspace(B.true, B.tsir, normalize = TRUE)
+        dist.subspace.mgcca <- dist.subspace(B.true, B.mgcca, normalize = TRUE)
+
+        # # Projection Distances: Spectral norm (2-norm) `|| P_A - P_B ||_2`,
+        # # equivalent to the subspace distance in this case.
+
+        # Call CSV logger writing results to file
+        logger()
+
+        # print progress
+        cat(sprintf("sample size (%d): %d/%d - rep: %d/%d\n",
+            sample.size, which(sample.size == sample.sizes),
+            length(sample.sizes), rep, reps))
+    }
+}
+
+
+### read simulation results and generate plots
+if (!interactive()) { pdf(file = paste(base.name, "pdf", sep = ".")) }
+
+sim <- read.csv(log.file)
+
+
+plot.sim(sim, "dist.subspace", main = "Subspace Distance",
+    xlab = "Sample Size", ylab = "Distance")
+
+# plot.sim(sim, "dist.projection", main = "Projection Distance",
+#     xlab = "Sample Size", ylab = "Distance")
+
+plot.sim(sim, "time", main = "Runtime",
+    xlab = "Sample Size", ylab = "Time [s]", ylim = c(0, 18))
+
+# aggr <- aggregate(sim, list(sim$sample.size), sd)
+# stats <- aggr[, c(2, 5, 7, 9, 11, 13, 15, 17, 19)]
+
+# names(stats) <- Map(tail, strsplit(names(stats), ".", fixed = TRUE), 1)
+# round(stats * 100, 2)
+
+# sim <- sim[!startsWith(names(sim), "time")]
+# sim <- sim[names(sim) != "rep"]
+# names(sim) <- strsplit(names(stats), ".", fixed = TRUE)
+# (as.data.frame(Map(function(m, s) {
+#     paste0(round(m, 2), " (", round(s, 2), ")")
+# },
+#     aggregate(sim, list(sim$size), mean),
+#     aggregate(sim, list(sim$size), sd)
+# )))
+
+# $n$ & gmlm        & pca         & hopca       & lpca        & clpca       & tsir        & mgcca
+# 100 & 0.34 (0.14) & 0.90 (0.04) & 0.90 (0.05) & 0.94 (0.09) & 0.91 (0.03) & 0.48 (0.19) & 0.55 (0.13)
+# 200 & 0.25 (0.11) & 0.90 (0.03) & 0.90 (0.03) & 0.96 (0.07) & 0.91 (0.02) & 0.38 (0.16) & 0.53 (0.10)
+# 300 & 0.20 (0.09) & 0.89 (0.02) & 0.89 (0.02) & 0.97 (0.06) & 0.91 (0.02) & 0.29 (0.13) & 0.51 (0.11)
+# 500 & 0.16 (0.07) & 0.90 (0.02) & 0.90 (0.02) & 0.98 (0.01) & 0.91 (0.01) & 0.23 (0.10) & 0.50 (0.08)
+# 750 & 0.13 (0.05) & 0.90 (0.01) & 0.90 (0.01) & 0.98 (0.02) & 0.91 (0.01) & 0.23 (0.08) & 0.53 (0.06)
+
+
+
+
+
+
+
+
+
+
+
+
+
+if (FALSE) {
+################################################################################
+###                             Work In Progress                             ###
+################################################################################
+library(tensorPredictors)
+
+dimX <- c(3, 3, 3)
+dimF <- c(1, 1, 1)
+
+betas <- Map(diag, 1, dimX, dimF)
+Omegas <- rev(list(
+    toeplitz(-1 * (seq_len(dimX[1]) == 2L)),
+    toeplitz(seq(1, 0, len = dimX[2])),
+    diag(dimX[3])
+))
+Omega <- Reduce(kronecker, rev(Omegas))
+
+layout(matrix(c(
+    1, 3, 4,
+    2, 3, 5,
+    6, 6, 6
+), nrow = 3, byrow = TRUE), heights = c(8, 8, 1))
+
+
+# Note the unusual backtick-quoted name: calls below read as `E(X |`(Y = -2),
+# i.e. E[X | Y = -2]
+`E(X |` <- function(Y) {
+    array(diag(ising_m2(diag(as.vector(mlm(array(Y, dimF), betas))) + Omega)), dimX)
+}
+`E(X |`(Y = -2)
+`E(X |`(Y = +2)
+
+col <- hcl.colors(256, "Blue-Red 3", rev = FALSE)
+matrixImage(`E(X |`(Y = -2), main = "E[X | Y = -2]", zlim = c(0, 1), col = col)
+matrixImage(`E(X |`(Y = -1), main = "E[X | Y = -1]", zlim = c(0, 1), col = col)
+matrixImage(`E(X |`(Y = 0), main = "E[X | Y = 0]", zlim = c(0, 1), col = col)
+matrixImage(`E(X |`(Y = +1), main = "E[X | Y = +1]", zlim = c(0, 1), col = col)
+matrixImage(`E(X |`(Y = +2), main = "E[X | Y = +2]", zlim = c(0, 1), col = col)
+{
+    restor.par <- par(mar = c(1.1, 2.1, 0, 2.1))
+    plot(0:1, 0:1, type = "n", xlab = "", ylab = "", axes = FALSE)
+    rasterImage(as.raster(matrix(col, nrow = 1)), 0, 0, 1, 1)
+    mtext("0", side = 2, las = 1, line = -3)
+    mtext("1", side = 4, las = 1, line = -3)
+    par(restor.par)
+}
+
+
+
+
+
+sample.size <- 100
+c(X, F, y, sample.axis) %<-% sample.data(sample.size, betas, Omegas)
+# Design matrix containing vectorized X's
+vecX <- mat(X, sample.axis)
+
+
+fit.gmlm <- gmlm_ising(X, F)
+fit.pca <- prcomp(mat(X, sample.axis), rank. = prod(dimF))
+fit.hopca <- HOPCA(X, npc = dimF, sample.axis = sample.axis)
+fit.tsir <- TSIR(X, y, dimF, sample.axis = sample.axis)
+fit.mgcca <- local({
+    X1 <- aperm(X, c(sample.axis, seq_along(dim(X))[-sample.axis]))
+    F1 <- aperm(F, c(sample.axis, seq_along(dim(X))[-sample.axis]))
+    mgcca(
+        list(X1, drop(F1)),     # `drop` removes 1D axes
+        quiet = TRUE,
+        scheme = "factorial",
+        ncomp = rep(prod(dimF), 2)
+    )
+})
+fit.lpca <- logisticPCA(vecX, k = prod(dimF), m = lpca.hyper.param)
+fit.clpca <- convexLogisticPCA(vecX, k = prod(dimF), m = lpca.hyper.param)
+
+
+B.gmlm <- Reduce(kronecker, rev(fit.gmlm$betas))
+B.pca <- fit.pca$rotation
+B.hopca <- Reduce(`%x%`, rev(fit.hopca))
+B.tsir <- Reduce(`%x%`, rev(fit.tsir))
+B.mgcca <- fit.mgcca$astar[[1]]
+B.lpca <- fit.lpca$U
+B.clpca <- fit.clpca$U
+# B.lsvd <- ???
+
+
+
+dist.subspace(B.true, B.gmlm, normalize = TRUE)
+dist.subspace(B.true, B.pca, normalize = TRUE)
+dist.subspace(B.true, B.hopca, normalize = TRUE)
+dist.subspace(B.true, B.tsir, normalize = TRUE)
+dist.subspace(B.true, B.mgcca, normalize = TRUE)
+dist.subspace(B.true, B.lpca, normalize = TRUE)
+dist.subspace(B.true, B.clpca, normalize = TRUE)
+
+
+
+################################################################################
+###                         End - Work In Progress                           ###
+################################################################################
+}