# tensor_predictors/sim/sim_1c_normal.R

library(tensorPredictors)
# library(RGCCA)
### Load a modified version which does _not_ create a cluster in the case of
### `n_cores == 1`, allowing huge speed improvements (at least on Ubuntu 22.04 LTS).
### Moreover, it is compatible with `Rscript`.
### Also added `Encoding: UTF-8` to `DESCRIPTION`.
devtools::load_all("~/Work/tensorPredictors/References/Software/TGCCA-modified", export_all = FALSE)
setwd("~/Work/tensorPredictors/sim/")
base.name <- format(Sys.time(), "sim_1c_normal-%Y%m%dT%H%M")
# Source utility functions used in most simulations (extracted for convenience)
source("./sim_utils.R")
# Set PRNG seed for reproducibility
# Note: `0x` is the HEX number prefix and the trailing `L` stands for "long",
# which is `R`'s way of indicating an integer.
set.seed(0x1cL, "Mersenne-Twister", "Inversion", "Rejection")
### Simulation configuration
reps <- 100 # number of simulation replications
sample.sizes <- c(100, 200, 300, 500, 750) # sample sizes `n`
dimX <- c(2, 3, 5) # predictor `X` dimension
dimF <- rep(2, length(dimX)) # dimension of the "function" `F(y)` of the response `y`
# set up the true model parameters (rank 1 betas)
betas <- list(
    `[<-`(matrix(0, dimX[1], dimF[1]), 1, , c(1, 1)),
    `[<-`(matrix(0, dimX[2], dimF[2]), 2, , c(1, 0)),
    `[<-`(matrix(0, dimX[3], dimF[3]), 3, , c(1, 2))
)
# invisible(Map(print.table, betas, zero.print = "."))
Omegas <- Map(function(pj) 0.5^abs(outer(1:pj, 1:pj, `-`)), dimX) # AR(0.5)
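# e.g. for `pj = 3` this yields rbind(c(1, 0.5, 0.25), c(0.5, 1, 0.5), c(0.25, 0.5, 1))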
eta1 <- 0
# True (full) reduction matrix to compare against
B.true <- as.matrix(as.numeric(15 == (1:30)))
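# (a 30 x 1 indicator with its single one at position 15: the betas' nonzero rows
# are 1, 2 and 3, so the spanned direction is e_3 %x% e_2 %x% e_1, whose only
# nonzero entry sits at vec-index (3 - 1) * 6 + (2 - 1) * 2 + 1 = 15)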
# define projections onto rank 1 matrices for betas
proj.betas <- Map(.projRank, rep(1L, length(betas)))
# data sampling routine
sample.data <- function(sample.size, eta1, betas, Omegas) {
    # response is a standard normal variable
    y <- rnorm(sample.size)
    # F(y) is a tensor of monomials
    y.pow <- Reduce(function(a, b) outer(a, b, `+`), Map(seq, 0L, len = dimF))
    F <- t(outer(y, as.vector(y.pow), `^`))
    dim(F) <- c(dimF, sample.size)
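    # i.e. `F[i, j, k, ]` holds the monomial `y^((i - 1) + (j - 1) + (k - 1))`, so
    # with `dimF = c(2, 2, 2)` the entries range over the powers `y^0` to `y^3`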
    # sample predictors from the tensor normal X | Y = y (last axis is the sample axis)
    sample.axis <- length(betas) + 1L
    Deltas <- Map(solve, Omegas) # normal covariances
    mu_y <- mlm(mlm(F, betas) + as.vector(eta1), Deltas) # conditional mean
    X <- mu_y + rtensornorm(sample.size, 0, Deltas, sample.axis) # predictors X
    list(X = X, F = F, y = y, sample.axis = sample.axis)
}
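# Quick sanity check of the sampling routine (kept commented out); the expected
# dimensions follow from `dimX`, `dimF` and the sample axis being last:
# c(X, F, y, sample.axis) %<-% sample.data(10L, eta1, betas, Omegas)
# stopifnot(all(dim(X) == c(dimX, 10L)), all(dim(F) == c(dimF, 10L)), length(y) == 10L)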
# Create a CSV logger to write simulation results to
log.file <- paste(base.name, "csv", sep = ".")
logger <- CSV.logger(
    file.name = log.file,
    header = c("sample.size", "rep", outer(
        c("time", "dist.subspace"),                 # measures
        c("gmlm", "pca", "hopca", "tsir", "mgcca"), # methods
        paste, sep = "."
    ))
)
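# Note: `logger()`, called inside the replication loop below, appears to look up
# the variables named in `header` in the calling frame and append them to the CSV
# file as a single row.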
# # true reduction (1D, select first component)
# B.true <- as.matrix(as.numeric(1 == (1:30)))
# B.true <- Reduce(`%x%`, rev(betas)) #[, 1L, drop = FALSE]
# print.table(B.true, zero.print = ".")
# print.table(Reduce(kronecker, rev(betas)), zero.print = ".")
# mu1 <- function(y) 1 + 3 * y + y^2
# matX <- mat(X, 1:3)
# x1 <- matX[1, ]
# x2 <- matX[2, ]
# plot(mu1(y), x1, col = cut(y, 10L))
# plot(y, x1, col = cut(y, 10L))
# plot(mu1(y), x2, col = cut(y, 10L))
# plot(y, matX[1, ], col = cut(y, 10L))
# plot(y, matX[10, ], col = cut(y, 10L))
# c(X, F, y, sample.axis) %<-% sample.data(sample.size, eta1, betas, Omegas)
# colnames(B.true) <- c("1", "y", "y", "y^2", "y", "y^2", "y^2", "y^3")
# mu_y <- function(y) {
# B.min <- cbind(
# B.true[, 1],
# rowSums(B.true[, c(2:3, 5)]),
# rowSums(B.true[, c(4, 6:7)]),
# B.true[, 8]
# )
# B.min %*% rbind(1, y, y^2, y^3)
# }
# plot((mat(X, 4) %*% B.min)[, 2], y)
### for each sample size
for (sample.size in sample.sizes) {
    # repeat every simulation
    for (rep in seq_len(reps)) {
        # Sample training data
        c(X, F, y, sample.axis) %<-% sample.data(sample.size, eta1, betas, Omegas)
        # fit different models
        # Wrapped in a try-catch block to ensure the simulation continues: if an
        # error occurs, log an error message and continue with the next replication
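        # Methods compared below: GMLM (generalized multi-linear model), classic
        # PCA, HOPCA (higher-order PCA), TSIR (tensor sliced inverse regression)
        # and MGCCA (multiway generalized canonical correlation analysis).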
        tryCatch({
            time.gmlm <- system.time(
                fit.gmlm <- gmlm_tensor_normal(X, F, sample.axis = sample.axis,
                    proj.betas = proj.betas)
            )["user.self"]
            time.pca <- system.time(
                fit.pca <- prcomp(mat(X, sample.axis), rank. = 1L)
            )["user.self"]
            time.hopca <- system.time(
                fit.hopca <- HOPCA(X, npc = c(1L, 1L, 1L), sample.axis = sample.axis)
            )["user.self"]
            time.tsir <- system.time(
                fit.tsir <- TSIR(X, y, d = c(1L, 1L, 1L), sample.axis = sample.axis)
            )["user.self"]
            # `mgcca` expects the first axis to be the sample axis
            X1 <- aperm(X, c(sample.axis, seq_along(dim(X))[-sample.axis]))
            F1 <- cbind(y, y^2, y^3)
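            # the second `mgcca` block stacks the (non-constant) monomials y, y^2,
            # y^3 of the response, mirroring the degrees appearing in `F(y)`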
            time.mgcca <- system.time(
                fit.mgcca <- mgcca(
                    list(X1, F1),       # the two data blocks: predictors and response features
                    quiet = TRUE,
                    scheme = "factorial",
                    ncomp = c(1L, 1L)
                )
            )["user.self"]
        }, error = function(ex) {
            print(ex)
        })
        # Compute the estimated reduction matrices
        B.gmlm <- Reduce(kronecker, Map(
            function(beta) qr.Q(qr(beta))[, 1L, drop = FALSE],
            rev(fit.gmlm$betas)))
        B.pca <- fit.pca$rotation
        B.hopca <- Reduce(`%x%`, rev(fit.hopca))
        B.tsir <- Reduce(`%x%`, rev(fit.tsir))
        B.mgcca <- fit.mgcca$astar[[1]]
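        # `astar[[1]]` presumably holds the canonical weight vector of the first
        # block (the predictors `X1`), used here as the estimated reduction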
        # Subspace distances: normalized `|| P_A - P_B ||_F` where
        # `P_A = A (A' A)^-1 A'` and the normalization ensures that, with respect
        # to the dimensions of `A, B`, the subspace distance lies in the range `[0, 1]`.
        dist.subspace.gmlm <- dist.subspace(B.true, B.gmlm, normalize = TRUE)
        dist.subspace.pca <- dist.subspace(B.true, B.pca, normalize = TRUE)
        dist.subspace.hopca <- dist.subspace(B.true, B.hopca, normalize = TRUE)
        dist.subspace.tsir <- dist.subspace(B.true, B.tsir, normalize = TRUE)
        dist.subspace.mgcca <- dist.subspace(B.true, B.mgcca, normalize = TRUE)
        # No projection distance, since in this case the subspace and projection
        # distances are identical
        # Call the CSV logger, writing the results to file
        logger()
        # print progress
        cat(sprintf("sample size (%d): %d/%d - rep: %d/%d\n",
            sample.size, which(sample.size == sample.sizes),
            length(sample.sizes), rep, reps))
    }
}
### read simulation results and generate plots
if (!interactive()) { pdf(file = paste(base.name, "pdf", sep = ".")) }
sim <- read.csv(log.file)
plot.sim(sim, "dist.subspace", main = "Subspace Distance",
    xlab = "Sample Size", ylab = "Distance")
plot.sim(sim, "time", main = "Runtime",
    xlab = "Sample Size", ylab = "Time [s]", ylim = c(0, 18))
# aggr <- aggregate(sim[, names(sim) != "sample.size"], list(sample.size = sim$sample.size), mean)