add: big data simulation with p growing proportional to n

2021-09-01 12:44:05 +02:00 · 2021-09-01 12:44:05 +02:00 · 69a008535b
commit 69a008535b
parent 0214823794
3 changed files with 104 additions and 45 deletions
--- a/simulations/simulations.R
+++ b/simulations/simulations.R
@ -14,7 +14,7 @@ args <- parse.args(defaults = list(
    dataset = '1',          # Name (number) of the data set
    # Neuronal Net. structure/definitions
    hidden_units = 512L,
-    activation = 'relu',    # or `relu`
+    activation = 'relu',
    trainable_reduction = TRUE,
    # Neuronal Net. training
    epochs = c(200L, 400L), # Number of training epochs for (`OPG`, Refinement)
--- a/simulations/simulations_bigdata.R
+++ b/simulations/simulations_bigdata.R
@ -12,6 +12,10 @@ args <- parse.args(defaults = list(
    # Simulation configuration
    reps = 10L,             # Number of replications
    dataset = '6',          # Name (number) of the data set
+    # Select which methods are evaluated (`run_mave` covers MAVE and OPG)
+    run_mave = TRUE,
+    run_cve = TRUE,
+    run_nn = TRUE,
    # Neuronal Net. structure/definitions
    hidden_units = 512L,
    activation = 'relu',
@ -34,7 +38,7 @@ ds <- dataset(args$dataset, n = 100L, p = args$p) # Generates a list with `X`, `
 ## Build Dimension Reduction Neuronal Network model (matching the data)
 nn <- nnsdr$new(
    input_shapes = list(x = ncol(ds$X)),
-    d = ncol(ds$B),
+    d = ncol(ds$B), # depends on the dataset type
    hidden_units = args$hidden_units,
    activation = args$activation,
    trainable_reduction = args$trainable_reduction
@ -53,51 +57,57 @@ for (rep in seq_len(args$reps)) {
        ## Sample test dataset
        ds.test <- dataset(ds$name, n = 1000L, p = args$p)

-        ## First the reference method `MAVE`
-        # To be fair for measuring the time, set `max.dim` to true reduction dimension
-        # and with `screen = ncol(X)` screening is turned "off".
-        time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B),
-            method = "meanMAVE", screen = ncol(X)))
-        d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
-        d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
-        mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
-        cat('"mave",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
-            time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
-            sep = '', file = log, append = TRUE)
-        ## and the `OPG` method
-        time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B),
-            method = "meanOPG", screen = ncol(X)))
-        d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
-        d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
-        mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
-        cat('"opg",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
-            time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
-            sep = '', file = log, append = TRUE)
+        if (args$run_mave) {
+            ## First the reference method `MAVE`
+            # To be fair for measuring the time, set `max.dim` to true reduction
+            # dimension and with `screen = ncol(X)` screening is turned "off".
+            time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B),
+                method = "meanMAVE", screen = ncol(X)))
+            d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
+            d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
+            mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
+            cat('"mave",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
+                time['user.self'], ',', time['sys.self'], ',', time['elapsed'],
+                '\n', sep = '', file = log, append = TRUE)
+            ## and the `OPG` method
+            time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B),
+                method = "meanOPG", screen = ncol(X)))
+            d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
+            d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
+            mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
+            cat('"opg",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
+                time['user.self'], ',', time['sys.self'], ',', time['elapsed'],
+                '\n', sep = '', file = log, append = TRUE)
+        }

-        ## Next the CVE method
-        time <- system.time(dr <- cve.call(X, Y, k = ncol(B)))
-        d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
-        d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
-        mse <- mean((predict(dr, ds.test$X, k = ncol(B)) - ds.test$Y)^2)
-        cat('"cve",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
-            time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
-            sep = '', file = log, append = TRUE)
+        if (args$run_cve) {
+            ## Next the CVE method
+            time <- system.time(dr <- cve.call(X, Y, k = ncol(B)))
+            d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
+            d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
+            mse <- mean((predict(dr, ds.test$X, k = ncol(B)) - ds.test$Y)^2)
+            cat('"cve",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
+                time['user.self'], ',', time['sys.self'], ',', time['elapsed'],
+                '\n', sep = '', file = log, append = TRUE)
+        }

-        ## Fit `DR` Neuronal Network model
-        time <- system.time(nn$fit(X, Y, epochs = args$epochs,
-            batch_size = args$batch_size, initializer = args$initializer))
-        # OPG estimate
-        d.sub <- dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE)
-        d.gra <- dist.grassmann(B, coef(nn, 'OPG'))
-        cat('"nn.opg",', rep, ',', d.sub, ',', d.gra, ',NA,NA,NA,NA\n',
-            sep = '', file = log, append = TRUE)
-        # Refinement estimate
-        d.sub <- dist.subspace(B, coef(nn), normalize = TRUE)
-        d.gra <- dist.grassmann(B, coef(nn))
-        mse <- mean((nn$predict(ds.test$X) - ds.test$Y)^2)
-        cat('"nn.ref",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
-            time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
-            sep = '', file = log, append = TRUE)
+        if (args$run_nn) {
+            ## Fit `DR` Neuronal Network model
+            time <- system.time(nn$fit(X, Y, epochs = args$epochs,
+                batch_size = args$batch_size, initializer = args$initializer))
+            # OPG estimate
+            d.sub <- dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE)
+            d.gra <- dist.grassmann(B, coef(nn, 'OPG'))
+            cat('"nn.opg",', rep, ',', d.sub, ',', d.gra, ',NA,NA,NA,NA\n',
+                sep = '', file = log, append = TRUE)
+            # Refinement estimate
+            d.sub <- dist.subspace(B, coef(nn), normalize = TRUE)
+            d.gra <- dist.grassmann(B, coef(nn))
+            mse <- mean((nn$predict(ds.test$X) - ds.test$Y)^2)
+            cat('"nn.ref",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
+                time['user.self'], ',', time['sys.self'], ',', time['elapsed'],
+                '\n', sep = '', file = log, append = TRUE)
+        }
    })

    ## Invoke the garbage collector
--- a/simulations/simulations_bigdata.sh
+++ b/simulations/simulations_bigdata.sh
@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Catch the termination signal `SIGINT` and invoke `user_interupt`
+trap user_interupt SIGINT
+
+# Reports a user interrupt and aborts the whole script with status 130 (shell
+# convention 128 + SIGINT), so `^C` leaves the loops instead of skipping ahead
+user_interupt() {
+    echo -e '\nUser Interrupt -> stopping simulation\n'
+    exit 130
+}
+
+# Big data simulation with `p = round(sqrt(n))`: `n` x4 per step, epochs roughly halved, MAVE and CVE disabled
+for ds in 6 8; do
+    command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=1000 --p=32 --epochs=200,400"
+    echo -e "\n$command"
+    time eval "$command"
+    command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=4000 --p=63 --epochs=100,200"
+    echo -e "\n$command"
+    time eval "$command"
+    command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=16000 --p=126 --epochs=50,100"
+    echo -e "\n$command"
+    time eval "$command"
+    command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=64000 --p=253 --epochs=25,50"
+    echo -e "\n$command"
+    time eval "$command"
+    command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=256000 --p=506 --epochs=12,25"
+    echo -e "\n$command"
+    time eval "$command"
+done
+
+# Simulation for big data with `p` proportional to `n` (the base case `n = 1000`,
+# `p = 32` is covered by the loop above). For i = 1, ..., 4: `n = 1000 * 4^i`,
+# `p = 32 * 4^i` and training epochs `epochs ~ (200, 400) * 1.5^(-i)`. MAVE and
+# CVE are enabled for i = 1 only and switched off for the larger configurations.
+for ds in 6 8; do
+    command="Rscript simulations_bigdata.R --reps=10 --run_mave=TRUE --run_cve=TRUE --dataset=$ds --n=4000 --p=128 --epochs=133,266"
+    echo -e "\n$command"
+    time eval "$command"
+    command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=16000 --p=512 --epochs=88,176"
+    echo -e "\n$command"
+    time eval "$command"
+    command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=64000 --p=2048 --epochs=59,118"
+    echo -e "\n$command"
+    time eval "$command"
+    command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=256000 --p=8192 --epochs=39,78"
+    echo -e "\n$command"
+    time eval "$command"
+done