add: simulation scripts
This commit is contained in:
		
							parent
							
								
									2c5a87ccfe
								
							
						
					
					
						commit
						0214823794
					
				
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@ -16,6 +16,7 @@ NNSDR/man/
 | 
			
		||||
*.Rdata
 | 
			
		||||
*.zip
 | 
			
		||||
*.tar.gz
 | 
			
		||||
*.tar.xz
 | 
			
		||||
*.BAK
 | 
			
		||||
 | 
			
		||||
# LaTeX - build/database/... files
 | 
			
		||||
 | 
			
		||||
@ -181,17 +181,7 @@ rlaplace <- function(n = 1, mu = 0, sd = 1) {
 | 
			
		||||
#' \eqn{e_j} is the \eqn{j}-th unit vector in the \eqn{p}-dimensional space.
 | 
			
		||||
#' \eqn{Y} is given as \deqn{Y = (b_1'X)^2+(b_2'X)^2+(b_3'X)^2+0.5\epsilon}
 | 
			
		||||
#' where \eqn{\epsilon} is standard normal.
 | 
			
		||||
#' @section M7:
 | 
			
		||||
#' The predictors are distributed as \eqn{X\sim t_3(I_p)}{X~t_3(I_p)} where
 | 
			
		||||
#' \eqn{t_3(I_p)} is the standard multivariate t-distribution with 3 degrees of
 | 
			
		||||
#' freedom, for a subspace dimension of \eqn{k = 4} with a default of
 | 
			
		||||
#' \eqn{n = 200} data points.
 | 
			
		||||
#' \eqn{p = 20, b_1 = e_1, b_2 = e_2, b_3 = e_3}, and \eqn{b_4 = e_p}, where
 | 
			
		||||
#' \eqn{e_j} is the \eqn{j}-th unit vector in the \eqn{p}-dimensional space.
 | 
			
		||||
#' \eqn{Y} is given as \deqn{Y = (b_1'X)(b_2'X)^2+(b_3'X)(b_4'X)+0.5\epsilon}
 | 
			
		||||
#' where \eqn{\epsilon} is distributed as generalized normal distribution with
 | 
			
		||||
#' location 0, shape-parameter 1, and the scale-parameter is chosen such that
 | 
			
		||||
#' \eqn{Var(\epsilon) = 0.25}.
 | 
			
		||||
#' @section M7: see "Local Linear Forests" <arXiv:1807.11408>
 | 
			
		||||
#'
 | 
			
		||||
#' @import stats
 | 
			
		||||
#' @importFrom stats rnorm rbinom
 | 
			
		||||
@ -253,18 +243,9 @@ dataset <- function(name = "M1", n = NULL, p = 20, sd = 0.5, ...) {
 | 
			
		||||
        X <- matrix(rnorm(n * p), n)
 | 
			
		||||
        Y <- rowSums((X %*% B)^2) + rnorm(n, 0, sd)
 | 
			
		||||
    } else if (name == "M7") {
 | 
			
		||||
        if (missing(n)) { n <- 400 }
 | 
			
		||||
        # B ... `p x 4`
 | 
			
		||||
        B <- diag(p)[, -(4:(p - 1))]
 | 
			
		||||
        # "R"andom "M"ulti"V"ariate "S"tudent
 | 
			
		||||
        X <- rmvt(n = n, sigma = diag(p), df = 3)
 | 
			
		||||
        XB <- X %*% B
 | 
			
		||||
        Y <- (XB[, 1]) * (XB[, 2])^2 + (XB[, 3]) * (XB[, 4])
 | 
			
		||||
        Y <- Y + rlaplace(n, 0, sd)
 | 
			
		||||
    } else if (name == "M8") {
 | 
			
		||||
        # see: "Local Linear Forests" <arXiv:1807.11408>
 | 
			
		||||
        if (missing(n)) { n <- 600 }
 | 
			
		||||
        if (missing(p)) { p <- 20 } # 10 and 50 in "Local Linear Forests"
 | 
			
		||||
        if (missing(p)) { p <- 20 }  # 10 and 50 in "Local Linear Forests"
 | 
			
		||||
        if (missing(sd)) { sd <- 5 } # or 20
 | 
			
		||||
 | 
			
		||||
        B <- diag(1, p, 4)
 | 
			
		||||
@ -274,11 +255,6 @@ dataset <- function(name = "M1", n = NULL, p = 20, sd = 0.5, ...) {
 | 
			
		||||
        XB <- X %*% B
 | 
			
		||||
        Y <- 10 * sin(pi * XB[, 1] * XB[, 2]) + 20 * (XB[, 3] - 0.5)^2 + 5 * XB[, 4] + rnorm(n, sd = sd)
 | 
			
		||||
        Y <- as.matrix(Y)
 | 
			
		||||
    } else if (name == "M9") {
 | 
			
		||||
        if (missing(n)) { n <- 300 }
 | 
			
		||||
        X <- matrix(rnorm(n * p), n, p)
 | 
			
		||||
        Y <- X[, 1] + (0.5 + X[, 2])^2 * rnorm(n)
 | 
			
		||||
        B <- diag(1, p, 2)
 | 
			
		||||
    } else {
 | 
			
		||||
        stop("Got unknown dataset name.")
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										48
									
								
								simulations/colin_test.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								simulations/colin_test.R
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,48 @@
 | 
			
		||||
library(MAVE)
 | 
			
		||||
library(CVarE)
 | 
			
		||||
library(NNSDR)
 | 
			
		||||
 | 
			
		||||
set.seed(797)
 | 
			
		||||
 | 
			
		||||
dataset <- function(n = 100, p = 10, sd = 0.5) {
 | 
			
		||||
    X <- matrix(rnorm(n * (p - 1)), n, p - 1)
 | 
			
		||||
    X <- cbind(-0.5 * (X[, 1] + X[, 2]) + 0.001 * rnorm(n), X)
 | 
			
		||||
    B <- diag(p)[, 4, drop = FALSE]
 | 
			
		||||
    Y <- as.matrix(X[, 4]^2 + rnorm(n, 0, sd))
 | 
			
		||||
    return(list(X = X, Y = Y, B = B))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
nn <- nnsdr$new(
 | 
			
		||||
    input_shapes = list(x = 10L),
 | 
			
		||||
    d = 1L,
 | 
			
		||||
    hidden_units = 512L,
 | 
			
		||||
    activation = 'relu',
 | 
			
		||||
    trainable_reduction = TRUE
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
sim <- data.frame(mave = rep(NA, 10), cve = NA, nn.opg = NA, nn.ref = NA)
 | 
			
		||||
for (i in 1:10) {
 | 
			
		||||
    cat(i, "/ 10\n")
 | 
			
		||||
    with(dataset(), {
 | 
			
		||||
        dr <- mave.compute(X, Y, method = 'meanMAVE', max.dim = 1)
 | 
			
		||||
        sim[i, 'mave'] <<- dist.subspace(B, coef(dr, 1), normalize = TRUE)
 | 
			
		||||
 | 
			
		||||
        dr <- cve.call(X, Y, k = 1)
 | 
			
		||||
        sim[i, 'cve'] <<- dist.subspace(B, coef(dr, 1), normalize = TRUE)
 | 
			
		||||
 | 
			
		||||
        nn$fit(X, Y, epochs = c(200L, 400L), batch_size = 32L, initializer = 'fromOPG')
 | 
			
		||||
        sim[i, 'nn.opg'] <<- dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE)
 | 
			
		||||
        sim[i, 'nn.ref'] <<- dist.subspace(B, coef(nn), normalize = TRUE)
 | 
			
		||||
    })
 | 
			
		||||
    nn$reset()
 | 
			
		||||
}
 | 
			
		||||
round(t(data.frame(
 | 
			
		||||
    mean   = colMeans(sim),
 | 
			
		||||
    median = apply(sim, 2, median),
 | 
			
		||||
    sd     = apply(sim, 2, sd)
 | 
			
		||||
)), 3)
 | 
			
		||||
 | 
			
		||||
#        &$\mave$& $\cve$&$\nn_{512}$ \\
 | 
			
		||||
# mean   & 0.917 & 0.164 & 0.101 \\
 | 
			
		||||
# median & 0.999 & 0.162 & 0.096 \\
 | 
			
		||||
# sd     & 0.256 & 0.057 & 0.032 \\
 | 
			
		||||
							
								
								
									
										98
									
								
								simulations/simulations.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										98
									
								
								simulations/simulations.R
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,98 @@
 | 
			
		||||
#!/usr/bin/env Rscript
 | 
			
		||||
 | 
			
		||||
library(MAVE)
 | 
			
		||||
library(CVarE)
 | 
			
		||||
Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings
 | 
			
		||||
suppressPackageStartupMessages({
 | 
			
		||||
    library(NNSDR)
 | 
			
		||||
})
 | 
			
		||||
 | 
			
		||||
## Parse script parameters
 | 
			
		||||
args <- parse.args(defaults = list(
 | 
			
		||||
    # Simulation configuration
 | 
			
		||||
    reps = 100,             # Number of replications
 | 
			
		||||
    dataset = '1',          # Name (number) of the data set
 | 
			
		||||
    # Neuronal Net. structure/definitions
 | 
			
		||||
    hidden_units = 512L,
 | 
			
		||||
    activation = 'relu',    # or `relu`
 | 
			
		||||
    trainable_reduction = TRUE,
 | 
			
		||||
    # Neuronal Net. training
 | 
			
		||||
    epochs = c(200L, 400L), # Number of training epochs for (`OPG`, Refinement)
 | 
			
		||||
    batch_size = 32L,
 | 
			
		||||
    initializer = 'fromOPG',
 | 
			
		||||
    seed = 1390L
 | 
			
		||||
))
 | 
			
		||||
 | 
			
		||||
## Generate reference data (gets re-sampled for each replication)
 | 
			
		||||
ds <- dataset(args$dataset) # Generates a list with `X`, `Y`, `B` and `name`
 | 
			
		||||
 | 
			
		||||
## Build Dimension Reduction Neuronal Network model (matching the data)
 | 
			
		||||
nn <- nnsdr$new(
 | 
			
		||||
    input_shapes = list(x = ncol(ds$X)),
 | 
			
		||||
    d = ncol(ds$B),
 | 
			
		||||
    hidden_units = args$hidden_units,
 | 
			
		||||
    activation = args$activation,
 | 
			
		||||
    trainable_reduction = args$trainable_reduction
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
## Open simulation log file, write simulation configuration and header
 | 
			
		||||
log <- file(format(Sys.time(), "results/sim_%Y%m%d_%H%M.csv"), "w", blocking = FALSE)
 | 
			
		||||
cat(paste('#', names(args), args, sep = ' ', collapse = '\n'), '\n',
 | 
			
		||||
    'method,replication,dist.subspace,dist.grassmann,mse\n',
 | 
			
		||||
    sep = '', file = log, append = TRUE)
 | 
			
		||||
 | 
			
		||||
## Set seed for sampling simulation data (NOT effecting the `NN` initialization)
 | 
			
		||||
set.seed(args$seed)
 | 
			
		||||
 | 
			
		||||
## Repeated simulation runs
 | 
			
		||||
for (rep in seq_len(args$reps)) {
 | 
			
		||||
    ## Re-sample seeded data for each simulation replication
 | 
			
		||||
    with(dataset(ds$name), {
 | 
			
		||||
        ## Sample test dataset
 | 
			
		||||
        ds.test <- dataset(ds$name, n = 1000)
 | 
			
		||||
 | 
			
		||||
        ## First the reference method `MAVE`
 | 
			
		||||
        dr <- mave(Y ~ X, method = "meanMAVE")
 | 
			
		||||
        d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
 | 
			
		||||
        d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
 | 
			
		||||
        mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
 | 
			
		||||
        cat('"mave",', rep, ',', d.sub, ',', d.gra, ',', mse, '\n', sep = '',
 | 
			
		||||
            file = log, append = TRUE)
 | 
			
		||||
        ## and the `OPG` method
 | 
			
		||||
        dr <- mave(Y ~ X, method = "meanOPG")
 | 
			
		||||
        d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
 | 
			
		||||
        d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
 | 
			
		||||
        mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
 | 
			
		||||
        cat('"opg",', rep, ',', d.sub, ',', d.gra, ',', mse, '\n', sep = '',
 | 
			
		||||
            file = log, append = TRUE)
 | 
			
		||||
 | 
			
		||||
        ## Next the `CVE` method
 | 
			
		||||
        dr <- cve(Y ~ X, k = ncol(B))
 | 
			
		||||
        d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
 | 
			
		||||
        d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
 | 
			
		||||
        mse <- mean((predict(dr, ds.test$X, k = ncol(B)) - ds.test$Y)^2)
 | 
			
		||||
        cat('"cve",', rep, ',', d.sub, ',', d.gra, ',', mse, '\n', sep = '',
 | 
			
		||||
            file = log, append = TRUE)
 | 
			
		||||
 | 
			
		||||
        ## Fit `NNSDR` model
 | 
			
		||||
        nn$fit(X, Y, epochs = args$epochs,
 | 
			
		||||
            batch_size = args$batch_size, initializer = args$initializer)
 | 
			
		||||
        # `OPG` estimate
 | 
			
		||||
        d.sub <- dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE)
 | 
			
		||||
        d.gra <- dist.grassmann(B, coef(nn, 'OPG'))
 | 
			
		||||
        cat('"nn.opg",', rep, ',', d.sub, ',', d.gra, ',', NA, '\n', sep = '',
 | 
			
		||||
            file = log, append = TRUE)
 | 
			
		||||
        # Refinement estimate
 | 
			
		||||
        d.sub <- dist.subspace(B, coef(nn), normalize = TRUE)
 | 
			
		||||
        d.gra <- dist.grassmann(B, coef(nn))
 | 
			
		||||
        mse <- mean((nn$predict(ds.test$X) - ds.test$Y)^2)
 | 
			
		||||
        cat('"nn.ref",', rep, ',', d.sub, ',', d.gra, ',', mse, '\n', sep = '',
 | 
			
		||||
            file = log, append = TRUE)
 | 
			
		||||
    })
 | 
			
		||||
 | 
			
		||||
    ## Reset model
 | 
			
		||||
    nn$reset()
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
## Finished, close simulation log file
 | 
			
		||||
close(log)
 | 
			
		||||
							
								
								
									
										111
									
								
								simulations/simulations_bigdata.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										111
									
								
								simulations/simulations_bigdata.R
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,111 @@
 | 
			
		||||
#!/usr/bin/env Rscript
 | 
			
		||||
 | 
			
		||||
library(MAVE)
 | 
			
		||||
library(CVarE)
 | 
			
		||||
Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings
 | 
			
		||||
suppressPackageStartupMessages({
 | 
			
		||||
    library(NNSDR)
 | 
			
		||||
})
 | 
			
		||||
 | 
			
		||||
## Parse script parameters
 | 
			
		||||
args <- parse.args(defaults = list(
 | 
			
		||||
    # Simulation configuration
 | 
			
		||||
    reps = 10L,             # Number of replications
 | 
			
		||||
    dataset = '6',          # Name (number) of the data set
 | 
			
		||||
    # Neuronal Net. structure/definitions
 | 
			
		||||
    hidden_units = 512L,
 | 
			
		||||
    activation = 'relu',
 | 
			
		||||
    trainable_reduction = TRUE,
 | 
			
		||||
    # Neuronal Net. training
 | 
			
		||||
    epochs = c(200L, 400L), # Number of training epochs for (`OPG`, Refinement)
 | 
			
		||||
    batch_size = 32L,
 | 
			
		||||
    initializer = 'fromOPG',
 | 
			
		||||
    # Simulation data generation configuration
 | 
			
		||||
    seed = 1390L,
 | 
			
		||||
    n = 100L,
 | 
			
		||||
    p = 20L
 | 
			
		||||
))
 | 
			
		||||
 | 
			
		||||
## Generate reference data (gets re-sampled for each replication)
 | 
			
		||||
# Number of observations are irrelevant for the reference to generate a matching
 | 
			
		||||
# `NNSDR` estimator
 | 
			
		||||
ds <- dataset(args$dataset, n = 100L, p = args$p) # Generates a list with `X`, `Y`, `B` and `name`
 | 
			
		||||
 | 
			
		||||
## Build Dimension Reduction Neuronal Network model (matching the data)
 | 
			
		||||
nn <- nnsdr$new(
 | 
			
		||||
    input_shapes = list(x = ncol(ds$X)),
 | 
			
		||||
    d = ncol(ds$B),
 | 
			
		||||
    hidden_units = args$hidden_units,
 | 
			
		||||
    activation = args$activation,
 | 
			
		||||
    trainable_reduction = args$trainable_reduction
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
## Open simulation log file, write simulation configuration and header
 | 
			
		||||
log <- file(format(Sys.time(), "results/sim_big_%Y%m%d_%H%M.csv"), "w", blocking = FALSE)
 | 
			
		||||
cat(paste('#', names(args), args, sep = ' ', collapse = '\n'), '\n',
 | 
			
		||||
    'method,replication,dist.subspace,dist.grassmann,mse,time.user,time.system,time.elapsed\n',
 | 
			
		||||
    sep = '', file = log, append = TRUE)
 | 
			
		||||
 | 
			
		||||
## Repeated simulation runs
 | 
			
		||||
for (rep in seq_len(args$reps)) {
 | 
			
		||||
    ## Re-sample seeded data for each simulation replication
 | 
			
		||||
    with(dataset(ds$name, n = args$n, p = args$p), {
 | 
			
		||||
        ## Sample test dataset
 | 
			
		||||
        ds.test <- dataset(ds$name, n = 1000L, p = args$p)
 | 
			
		||||
 | 
			
		||||
        ## First the reference method `MAVE`
 | 
			
		||||
        # To be fair for measuring the time, set `max.dim` to true reduction dimension
 | 
			
		||||
        # and with `screen = ncol(X)` screening is turned "off".
 | 
			
		||||
        time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B),
 | 
			
		||||
            method = "meanMAVE", screen = ncol(X)))
 | 
			
		||||
        d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
 | 
			
		||||
        d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
 | 
			
		||||
        mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
 | 
			
		||||
        cat('"mave",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
 | 
			
		||||
            time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
 | 
			
		||||
            sep = '', file = log, append = TRUE)
 | 
			
		||||
        ## and the `OPG` method
 | 
			
		||||
        time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B),
 | 
			
		||||
            method = "meanOPG", screen = ncol(X)))
 | 
			
		||||
        d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
 | 
			
		||||
        d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
 | 
			
		||||
        mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
 | 
			
		||||
        cat('"opg",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
 | 
			
		||||
            time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
 | 
			
		||||
            sep = '', file = log, append = TRUE)
 | 
			
		||||
 | 
			
		||||
        ## Next the CVE method
 | 
			
		||||
        time <- system.time(dr <- cve.call(X, Y, k = ncol(B)))
 | 
			
		||||
        d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
 | 
			
		||||
        d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
 | 
			
		||||
        mse <- mean((predict(dr, ds.test$X, k = ncol(B)) - ds.test$Y)^2)
 | 
			
		||||
        cat('"cve",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
 | 
			
		||||
            time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
 | 
			
		||||
            sep = '', file = log, append = TRUE)
 | 
			
		||||
 | 
			
		||||
        ## Fit `DR` Neuronal Network model
 | 
			
		||||
        time <- system.time(nn$fit(X, Y, epochs = args$epochs,
 | 
			
		||||
            batch_size = args$batch_size, initializer = args$initializer))
 | 
			
		||||
        # OPG estimate
 | 
			
		||||
        d.sub <- dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE)
 | 
			
		||||
        d.gra <- dist.grassmann(B, coef(nn, 'OPG'))
 | 
			
		||||
        cat('"nn.opg",', rep, ',', d.sub, ',', d.gra, ',NA,NA,NA,NA\n',
 | 
			
		||||
            sep = '', file = log, append = TRUE)
 | 
			
		||||
        # Refinement estimate
 | 
			
		||||
        d.sub <- dist.subspace(B, coef(nn), normalize = TRUE)
 | 
			
		||||
        d.gra <- dist.grassmann(B, coef(nn))
 | 
			
		||||
        mse <- mean((nn$predict(ds.test$X) - ds.test$Y)^2)
 | 
			
		||||
        cat('"nn.ref",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
 | 
			
		||||
            time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
 | 
			
		||||
            sep = '', file = log, append = TRUE)
 | 
			
		||||
    })
 | 
			
		||||
 | 
			
		||||
    ## Invoke the garbage collector
 | 
			
		||||
    gc()
 | 
			
		||||
 | 
			
		||||
    ## Reset model
 | 
			
		||||
    nn$reset()
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
## Finished, close simulation log file
 | 
			
		||||
close(log)
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user