232 lines
		
	
	
		
			8.8 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
			
		
		
	
	
			232 lines
		
	
	
		
			8.8 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
library(tensorPredictors)
 | 
						|
# library(RGCCA)    # for `mgcca`
 | 
						|
### Load modified version which _does not_ create a clusster in case of
 | 
						|
### `n_cores == 1` allowing huge speed improvements! (at least on Ubuntu 22.04 LTS)
 | 
						|
### Moreover, it is compatible with `Rscript`
 | 
						|
devtools::load_all("~/Work/tensorPredictors/References/Software/TGCCA-modified", export_all = FALSE)
 | 
						|
 | 
						|
 | 
						|
setwd("~/Work/tensorPredictors/sim/")
 | 
						|
base.name <- format(Sys.time(), "sim_1e_normal-%Y%m%dT%H%M")
 | 
						|
 | 
						|
# Source utility function used in most simulations (extracted for convenience)
 | 
						|
source("./sim_utils.R")
 | 
						|
 | 
						|
# Set PRNG seed for reproducability
 | 
						|
# Note: `0x` is the HEX number prefix and the trailing `L` stands for "long"
 | 
						|
# which is `R`s way if indicating an integer.
 | 
						|
set.seed(0x1eL, "Mersenne-Twister", "Inversion", "Rejection")
 | 
						|
 | 
						|
### Simulation configuration
 | 
						|
reps <- 100                     # number of simulation replications
 | 
						|
sample.sizes <- c(100, 200, 300, 500, 750)  # sample sizes `n`
 | 
						|
dimX <- c(5, 5)               # predictor `X` dimension
 | 
						|
dimF <- c(2, 2)
 | 
						|
 | 
						|
 | 
						|
Sigma <- 0.5^abs(outer(1:prod(dimX), 1:prod(dimX), `-`))   # AR(0.5)
 | 
						|
 | 
						|
# # define projections onto tri-diagonal matrixes
 | 
						|
# proj.betas <- Map(.projRank, rep(1L, length(dimX)))     # wrong assumption of low rank betas
 | 
						|
# proj.Omegas <- Map(.projBand, Map(c, dimX, dimX), 1L, 1L) # wrong assumption of band Scatter matrices
 | 
						|
 | 
						|
# data sampling routine
 | 
						|
sample.data <- function(sample.size, dimX, dimF, B.true, Sigma) {
 | 
						|
    y <- rnorm(sample.size)
 | 
						|
 | 
						|
    # the true F (in vectorized form)
 | 
						|
    vecF <- rbind(1, sin(y), cos(y), sin(y) * cos(y))
 | 
						|
 | 
						|
    # sample vectorized X as a multi-variate normal (in vectorized form)
 | 
						|
    vecX <- B.true %*% vecF + t(chol(Sigma)) %*% matrix(rnorm(prod(sample.size, dimX)), prod(dimX))
 | 
						|
    X <- array(vecX, c(dimX, sample.size))
 | 
						|
 | 
						|
    list(X, vecF, y, length(dim(X)))
 | 
						|
}
 | 
						|
 | 
						|
# wrong assumption about the function `F(y)`
 | 
						|
F.wrong <- function(y) array(rbind(1, y, y, y^2), c(2, 2, length(y)))
 | 
						|
 | 
						|
 | 
						|
# Create a CSV logger to write simulation results to
 | 
						|
log.file <- paste(base.name, "csv", sep = ".")
 | 
						|
logger <- CSV.logger(
 | 
						|
    file.name = log.file,
 | 
						|
    header = c("sample.size", "rep", outer(
 | 
						|
        c("time", "dist.subspace", "dist.projection"),  # measures
 | 
						|
        c("gmlm", "pca", "hopca", "tsir", "mgcca"),     # methods
 | 
						|
        paste, sep = "."
 | 
						|
    ))
 | 
						|
)
 | 
						|
 | 
						|
# Miss-specify true beta as _not_ a kronecker product
 | 
						|
B.true <- diag(1, prod(dimX), prod(dimF))
 | 
						|
 | 
						|
### for each sample size
 | 
						|
for (sample.size in sample.sizes) {
 | 
						|
    # repeate every simulation
 | 
						|
    for (rep in seq_len(reps)) {
 | 
						|
        # Sample training data
 | 
						|
        c(X, F.true, y, sample.axis) %<-% sample.data(sample.size, dimX, dimF, B.true, Sigma)
 | 
						|
 | 
						|
        # fit different models
 | 
						|
        # Wrapped in try-catch clock to ensure the simulation continues,
 | 
						|
        # if an error occures continue with nest resplication and log an error message
 | 
						|
        tryCatch({
 | 
						|
            time.gmlm <- system.time(
 | 
						|
                fit.gmlm <- gmlm_tensor_normal(X, F.wrong(y), sample.axis = sample.axis)
 | 
						|
            )["user.self"]
 | 
						|
            time.pca <- system.time(
 | 
						|
                fit.pca <- prcomp(mat(X, sample.axis), rank. = prod(dimF))
 | 
						|
            )["user.self"]
 | 
						|
            time.hopca <- system.time(
 | 
						|
                fit.hopca <- HOPCA(X, npc = dimF, sample.axis = sample.axis)
 | 
						|
            )["user.self"]
 | 
						|
            time.tsir <- system.time(
 | 
						|
                fit.tsir <- TSIR(X, y, dimF, sample.axis = sample.axis)
 | 
						|
            )["user.self"]
 | 
						|
            # `mgcca` expects the first axis to be the sample axis
 | 
						|
            X1 <- aperm(X, c(sample.axis, seq_along(dim(X))[-sample.axis]))
 | 
						|
            time.mgcca <- system.time(
 | 
						|
                fit.mgcca <- mgcca(
 | 
						|
                    list(X1, y),
 | 
						|
                    quiet = TRUE,
 | 
						|
                    scheme = "factorial",
 | 
						|
                    ncomp = c(prod(dimF), 1),
 | 
						|
                    ranks = c(prod(dimF), 1)
 | 
						|
                )
 | 
						|
            )["user.self"]
 | 
						|
        }, error = function(ex) {
 | 
						|
            print(ex)
 | 
						|
        })
 | 
						|
 | 
						|
        # Compute true reduction matrix
 | 
						|
        B.gmlm <- with(fit.gmlm, Reduce(`%x%`, rev(betas)))
 | 
						|
        B.pca <- fit.pca$rotation
 | 
						|
        B.hopca <- Reduce(`%x%`, rev(fit.hopca))
 | 
						|
        B.tsir <- Reduce(`%x%`, rev(fit.tsir))
 | 
						|
        B.mgcca <- fit.mgcca$astar[[1]]
 | 
						|
 | 
						|
        # Subspace Distances: Normalized `|| P_A - P_B ||_F` where
 | 
						|
        #   `P_A = A (A' A)^-1 A'` and the normalization means that with
 | 
						|
        #   respect to the dimensions of `A, B` the subspace distance is in the
 | 
						|
        #   range `[0, 1]`.
 | 
						|
        dist.subspace.gmlm  <- dist.subspace(B.true, B.gmlm,  normalize = TRUE)
 | 
						|
        dist.subspace.pca   <- dist.subspace(B.true, B.pca,   normalize = TRUE)
 | 
						|
        dist.subspace.hopca <- dist.subspace(B.true, B.hopca, normalize = TRUE)
 | 
						|
        dist.subspace.tsir  <- dist.subspace(B.true, B.tsir,  normalize = TRUE)
 | 
						|
        dist.subspace.mgcca <- dist.subspace(B.true, B.mgcca, normalize = TRUE)
 | 
						|
 | 
						|
        # Projection Distances: Spectral norm (2-norm) `|| P_A - P_B ||_2`.
 | 
						|
        dist.projection.gmlm  <- dist.projection(B.true, B.gmlm)
 | 
						|
        dist.projection.pca   <- dist.projection(B.true, B.pca)
 | 
						|
        dist.projection.hopca <- dist.projection(B.true, B.hopca)
 | 
						|
        dist.projection.tsir  <- dist.projection(B.true, B.tsir)
 | 
						|
        dist.projection.mgcca <- dist.projection(B.true, B.mgcca)
 | 
						|
 | 
						|
        # Call CSV logger writing results to file
 | 
						|
        logger()
 | 
						|
 | 
						|
        # print progress
 | 
						|
        cat(sprintf("sample size (%d): %d/%d - rep: %d/%d\n",
 | 
						|
            sample.size, which(sample.size == sample.sizes),
 | 
						|
            length(sample.sizes), rep, reps))
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
### read simulation results generate plots
 | 
						|
if (!interactive()) { pdf(file = paste(base.name, "pdf", sep = ".")) }
 | 
						|
 | 
						|
sim <- read.csv(log.file)
 | 
						|
 | 
						|
# Remain sample size grouping variable to avoid conflicts
 | 
						|
aggr.mean   <- aggregate(sim, list(sampleSize = sim$sample.size), mean)
 | 
						|
aggr.median <- aggregate(sim, list(sampleSize = sim$sample.size), median)
 | 
						|
aggr.sd     <- aggregate(sim, list(sampleSize = sim$sample.size), sd)
 | 
						|
aggr.min    <- aggregate(sim, list(sampleSize = sim$sample.size), min)
 | 
						|
aggr.max    <- aggregate(sim, list(sampleSize = sim$sample.size), max)
 | 
						|
 | 
						|
par(pch = 16, bty = "n", lty = "solid", lwd = 1.5)
 | 
						|
cols <- c(gmlm = "blue", pca = "darkcyan", hopca = "red", tsir = "darkgreen",
 | 
						|
    mgcca = "purple")
 | 
						|
 | 
						|
with(aggr.mean, {
 | 
						|
    plot(range(sampleSize), c(0, 1), type = "n",
 | 
						|
        main = "Subspace Distance",
 | 
						|
        xlab = "Sample Size",
 | 
						|
        ylab = "Distance"
 | 
						|
    )
 | 
						|
    for (dist.name in ls(pattern = "^dist.subspace")) {
 | 
						|
        mean <- get(dist.name)
 | 
						|
        median <- aggr.median[aggr.sd$sampleSize == sampleSize, dist.name]
 | 
						|
        sd     <-     aggr.sd[aggr.sd$sampleSize == sampleSize, dist.name]
 | 
						|
        min    <-    aggr.min[aggr.sd$sampleSize == sampleSize, dist.name]
 | 
						|
        max    <-    aggr.max[aggr.sd$sampleSize == sampleSize, dist.name]
 | 
						|
        method <- tail(strsplit(dist.name, ".", fixed = TRUE)[[1]], 1)
 | 
						|
        col <- cols[method]
 | 
						|
        lines(sampleSize, mean, type = "o", col = col, lty = 1, lwd = 1)
 | 
						|
        lines(sampleSize, mean + sd,        col = col, lty = 2, lwd = 0.8)
 | 
						|
        lines(sampleSize, mean - sd,        col = col, lty = 2, lwd = 0.8)
 | 
						|
        lines(sampleSize, median,           col = col, lty = 1, lwd = 1)
 | 
						|
        lines(sampleSize, min,              col = col, lty = 3, lwd = 0.6)
 | 
						|
        lines(sampleSize, max,              col = col, lty = 3, lwd = 0.6)
 | 
						|
    }
 | 
						|
 | 
						|
    legend("topright", col = cols, lty = 1, legend = names(cols),
 | 
						|
        bty = "n", lwd = par("lwd"), pch = par("pch"))
 | 
						|
})
 | 
						|
 | 
						|
with(aggr.mean, {
 | 
						|
    plot(range(sampleSize), c(0, 1), type = "n",
 | 
						|
        main = "Projection Distance",
 | 
						|
        xlab = "Sample Size",
 | 
						|
        ylab = "Distance"
 | 
						|
    )
 | 
						|
    for (dist.name in ls(pattern = "^dist.projection")) {
 | 
						|
        mean <- get(dist.name)
 | 
						|
        median <- aggr.median[aggr.sd$sampleSize == sampleSize, dist.name]
 | 
						|
        sd     <-     aggr.sd[aggr.sd$sampleSize == sampleSize, dist.name]
 | 
						|
        min    <-    aggr.min[aggr.sd$sampleSize == sampleSize, dist.name]
 | 
						|
        max    <-    aggr.max[aggr.sd$sampleSize == sampleSize, dist.name]
 | 
						|
        method <- tail(strsplit(dist.name, ".", fixed = TRUE)[[1]], 1)
 | 
						|
        col <- cols[method]
 | 
						|
        lines(sampleSize, mean, type = "o", col = col, lty = 1, lwd = 1)
 | 
						|
        lines(sampleSize, mean + sd,        col = col, lty = 2, lwd = 0.8)
 | 
						|
        lines(sampleSize, mean - sd,        col = col, lty = 2, lwd = 0.8)
 | 
						|
        lines(sampleSize, median,           col = col, lty = 1, lwd = 1)
 | 
						|
        lines(sampleSize, min,              col = col, lty = 3, lwd = 0.6)
 | 
						|
        lines(sampleSize, max,              col = col, lty = 3, lwd = 0.6)
 | 
						|
    }
 | 
						|
 | 
						|
    legend("topright", col = cols, lty = 1, legend = names(cols),
 | 
						|
        bty = "n", lwd = par("lwd"), pch = par("pch"))
 | 
						|
})
 | 
						|
 | 
						|
 | 
						|
 | 
						|
# sample.axis <- 1L
 | 
						|
# F.wrong <- array(outer(y, c(0, 1, 1, 2, 1, 2, 2, 3), `^`), c(10, 2, 2, 2))
 | 
						|
 | 
						|
 | 
						|
# F.true <- array(c(1, sin(y), cos(y)))
 | 
						|
 | 
						|
 | 
						|
# 1 s c
 | 
						|
# s ss cs
 | 
						|
# c sc cc
 | 
						|
 | 
						|
# s ss cs
 | 
						|
# ss sss css
 | 
						|
# cs scs ccs
 | 
						|
 | 
						|
# c sc cc
 | 
						|
# sc ssc csc
 | 
						|
# cc scc ccc
 | 
						|
 | 
						|
# osc <- 
 | 
						|
 | 
						|
# I <- array(0, c(3, 3, 3))
 | 
						|
 | 
						|
# slice.index()
 |