diff --git a/simulations/simulations.R b/simulations/simulations.R index 179517e..4c595bf 100644 --- a/simulations/simulations.R +++ b/simulations/simulations.R @@ -14,7 +14,7 @@ args <- parse.args(defaults = list( dataset = '1', # Name (number) of the data set # Neuronal Net. structure/definitions hidden_units = 512L, - activation = 'relu', # or `relu` + activation = 'relu', trainable_reduction = TRUE, # Neuronal Net. training epochs = c(200L, 400L), # Number of training epochs for (`OPG`, Refinement) diff --git a/simulations/simulations_bigdata.R b/simulations/simulations_bigdata.R index e05dfe6..757c1ce 100644 --- a/simulations/simulations_bigdata.R +++ b/simulations/simulations_bigdata.R @@ -12,6 +12,10 @@ args <- parse.args(defaults = list( # Simulation configuration reps = 10L, # Number of replications dataset = '6', # Name (number) of the data set + # Select which methods are run: `MAVE`/`OPG`, `CVE` and the neural network + run_mave = TRUE, + run_cve = TRUE, + run_nn = TRUE, # Neuronal Net. structure/definitions hidden_units = 512L, activation = 'relu', @@ -34,7 +38,7 @@ ds <- dataset(args$dataset, n = 100L, p = args$p) # Generates a list with `X`, ` ## Build Dimension Reduction Neuronal Network model (matching the data) nn <- nnsdr$new( input_shapes = list(x = ncol(ds$X)), - d = ncol(ds$B), + d = ncol(ds$B), # depends on the dataset type hidden_units = args$hidden_units, activation = args$activation, trainable_reduction = args$trainable_reduction @@ -53,51 +57,57 @@ for (rep in seq_len(args$reps)) { ## Sample test dataset ds.test <- dataset(ds$name, n = 1000L, p = args$p) - ## First the reference method `MAVE` - # To be fair for measuring the time, set `max.dim` to true reduction dimension - # and with `screen = ncol(X)` screening is turned "off". 
- time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B), - method = "meanMAVE", screen = ncol(X))) - d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE) - d.gra <- dist.grassmann(B, coef(dr, ncol(B))) - mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2) - cat('"mave",', rep, ',', d.sub, ',', d.gra, ',', mse, ',', - time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n', - sep = '', file = log, append = TRUE) - ## and the `OPG` method - time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B), - method = "meanOPG", screen = ncol(X))) - d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE) - d.gra <- dist.grassmann(B, coef(dr, ncol(B))) - mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2) - cat('"opg",', rep, ',', d.sub, ',', d.gra, ',', mse, ',', - time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n', - sep = '', file = log, append = TRUE) + if (args$run_mave) { + ## First the reference method `MAVE` + # To be fair for measuring the time, set `max.dim` to true reduction + # dimension and with `screen = ncol(X)` screening is turned "off". 
+ time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B), + method = "meanMAVE", screen = ncol(X))) + d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE) + d.gra <- dist.grassmann(B, coef(dr, ncol(B))) + mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2) + cat('"mave",', rep, ',', d.sub, ',', d.gra, ',', mse, ',', + time['user.self'], ',', time['sys.self'], ',', time['elapsed'], + '\n', sep = '', file = log, append = TRUE) + ## and the `OPG` method + time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B), + method = "meanOPG", screen = ncol(X))) + d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE) + d.gra <- dist.grassmann(B, coef(dr, ncol(B))) + mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2) + cat('"opg",', rep, ',', d.sub, ',', d.gra, ',', mse, ',', + time['user.self'], ',', time['sys.self'], ',', time['elapsed'], + '\n', sep = '', file = log, append = TRUE) + } - ## Next the CVE method - time <- system.time(dr <- cve.call(X, Y, k = ncol(B))) - d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE) - d.gra <- dist.grassmann(B, coef(dr, ncol(B))) - mse <- mean((predict(dr, ds.test$X, k = ncol(B)) - ds.test$Y)^2) - cat('"cve",', rep, ',', d.sub, ',', d.gra, ',', mse, ',', - time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n', - sep = '', file = log, append = TRUE) + if (args$run_cve) { + ## Next the CVE method + time <- system.time(dr <- cve.call(X, Y, k = ncol(B))) + d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE) + d.gra <- dist.grassmann(B, coef(dr, ncol(B))) + mse <- mean((predict(dr, ds.test$X, k = ncol(B)) - ds.test$Y)^2) + cat('"cve",', rep, ',', d.sub, ',', d.gra, ',', mse, ',', + time['user.self'], ',', time['sys.self'], ',', time['elapsed'], + '\n', sep = '', file = log, append = TRUE) + } - ## Fit `DR` Neuronal Network model - time <- system.time(nn$fit(X, Y, epochs = args$epochs, - batch_size = args$batch_size, initializer = 
args$initializer)) - # OPG estimate - d.sub <- dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE) - d.gra <- dist.grassmann(B, coef(nn, 'OPG')) - cat('"nn.opg",', rep, ',', d.sub, ',', d.gra, ',NA,NA,NA,NA\n', - sep = '', file = log, append = TRUE) - # Refinement estimate - d.sub <- dist.subspace(B, coef(nn), normalize = TRUE) - d.gra <- dist.grassmann(B, coef(nn)) - mse <- mean((nn$predict(ds.test$X) - ds.test$Y)^2) - cat('"nn.ref",', rep, ',', d.sub, ',', d.gra, ',', mse, ',', - time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n', - sep = '', file = log, append = TRUE) + if (args$run_nn) { + ## Fit `DR` Neuronal Network model + time <- system.time(nn$fit(X, Y, epochs = args$epochs, + batch_size = args$batch_size, initializer = args$initializer)) + # OPG estimate + d.sub <- dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE) + d.gra <- dist.grassmann(B, coef(nn, 'OPG')) + cat('"nn.opg",', rep, ',', d.sub, ',', d.gra, ',NA,NA,NA,NA\n', + sep = '', file = log, append = TRUE) + # Refinement estimate + d.sub <- dist.subspace(B, coef(nn), normalize = TRUE) + d.gra <- dist.grassmann(B, coef(nn)) + mse <- mean((nn$predict(ds.test$X) - ds.test$Y)^2) + cat('"nn.ref",', rep, ',', d.sub, ',', d.gra, ',', mse, ',', + time['user.self'], ',', time['sys.self'], ',', time['elapsed'], + '\n', sep = '', file = log, append = TRUE) + } }) ## Invoke the garbage collector diff --git a/simulations/simulations_bigdata.sh b/simulations/simulations_bigdata.sh new file mode 100644 index 0000000..d0ffe07 --- /dev/null +++ b/simulations/simulations_bigdata.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Catch the interrupt signal `SIGINT` and invoke `user_interupt` +trap user_interupt SIGINT + +# Reports a user interrupt and exits the simulation script (does not continue +# with the next statement, which allows leaving the bash loops with `^C`) +user_interupt() { + echo -e '\nUser Interrupt -> stopping simulation\n' + exit +} + +# Simulation for big data with `p` proportional to `sqrt(n)` +for ds in 6 8; do 
+ command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=1000 --p=32 --epochs=200,400" + echo -e "\n$command" + time eval "$command" + command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=4000 --p=63 --epochs=100,200" + echo -e "\n$command" + time eval "$command" + command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=16000 --p=126 --epochs=50,100" + echo -e "\n$command" + time eval "$command" + command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=64000 --p=253 --epochs=25,50" + echo -e "\n$command" + time eval "$command" + command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=256000 --p=506 --epochs=12,25" + echo -e "\n$command" + time eval "$command" +done + +# Simulation for big data with `p` proportional to `n` (note: for the base case +# of `n = 1000`, `p = 32` see above) +# For i = 1, ..., 4 the sample size `n = 1000 * 4^i`, number of predictors +# `p = 32 * 4^i` and the training epochs `epochs ~ (200, 400) * 1.5^(-i)` +for ds in 6 8; do + command="Rscript simulations_bigdata.R --reps=10 --run_mave=TRUE --run_cve=TRUE --dataset=$ds --n=4000 --p=128 --epochs=133,266" + echo -e "\n$command" + time eval "$command" + command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=16000 --p=512 --epochs=88,176" + echo -e "\n$command" + time eval "$command" + command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=64000 --p=2048 --epochs=59,118" + echo -e "\n$command" + time eval "$command" + command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=256000 --p=8192 --epochs=39,78" + echo -e "\n$command" + time eval "$command" +done