add: big data simulation with p growing proportional to n
This commit is contained in:
parent
0214823794
commit
69a008535b
|
@ -14,7 +14,7 @@ args <- parse.args(defaults = list(
|
||||||
dataset = '1', # Name (number) of the data set
|
dataset = '1', # Name (number) of the data set
|
||||||
# Neuronal Net. structure/definitions
|
# Neuronal Net. structure/definitions
|
||||||
hidden_units = 512L,
|
hidden_units = 512L,
|
||||||
activation = 'relu', # or `relu`
|
activation = 'relu',
|
||||||
trainable_reduction = TRUE,
|
trainable_reduction = TRUE,
|
||||||
# Neuronal Net. training
|
# Neuronal Net. training
|
||||||
epochs = c(200L, 400L), # Number of training epochs for (`OPG`, Refinement)
|
epochs = c(200L, 400L), # Number of training epochs for (`OPG`, Refinement)
|
||||||
|
|
|
@ -12,6 +12,10 @@ args <- parse.args(defaults = list(
|
||||||
# Simulation configuration
|
# Simulation configuration
|
||||||
reps = 10L, # Number of replications
|
reps = 10L, # Number of replications
|
||||||
dataset = '6', # Name (number) of the data set
|
dataset = '6', # Name (number) of the data set
|
||||||
|
# Flags selecting which methods are evaluated (`run_mave` covers `MAVE` and `OPG`)
|
||||||
|
run_mave = TRUE,
|
||||||
|
run_cve = TRUE,
|
||||||
|
run_nn = TRUE,
|
||||||
# Neuronal Net. structure/definitions
|
# Neuronal Net. structure/definitions
|
||||||
hidden_units = 512L,
|
hidden_units = 512L,
|
||||||
activation = 'relu',
|
activation = 'relu',
|
||||||
|
@ -34,7 +38,7 @@ ds <- dataset(args$dataset, n = 100L, p = args$p) # Generates a list with `X`, `
|
||||||
## Build Dimension Reduction Neuronal Network model (matching the data)
|
## Build Dimension Reduction Neuronal Network model (matching the data)
|
||||||
nn <- nnsdr$new(
|
nn <- nnsdr$new(
|
||||||
input_shapes = list(x = ncol(ds$X)),
|
input_shapes = list(x = ncol(ds$X)),
|
||||||
d = ncol(ds$B),
|
d = ncol(ds$B), # depends on the dataset type
|
||||||
hidden_units = args$hidden_units,
|
hidden_units = args$hidden_units,
|
||||||
activation = args$activation,
|
activation = args$activation,
|
||||||
trainable_reduction = args$trainable_reduction
|
trainable_reduction = args$trainable_reduction
|
||||||
|
@ -53,17 +57,18 @@ for (rep in seq_len(args$reps)) {
|
||||||
## Sample test dataset
|
## Sample test dataset
|
||||||
ds.test <- dataset(ds$name, n = 1000L, p = args$p)
|
ds.test <- dataset(ds$name, n = 1000L, p = args$p)
|
||||||
|
|
||||||
|
if (args$run_mave) {
|
||||||
## First the reference method `MAVE`
|
## First the reference method `MAVE`
|
||||||
# To be fair for measuring the time, set `max.dim` to true reduction dimension
|
# To be fair for measuring the time, set `max.dim` to true reduction
|
||||||
# and with `screen = ncol(X)` screening is turned "off".
|
# dimension and with `screen = ncol(X)` screening is turned "off".
|
||||||
time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B),
|
time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B),
|
||||||
method = "meanMAVE", screen = ncol(X)))
|
method = "meanMAVE", screen = ncol(X)))
|
||||||
d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
|
d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
|
||||||
d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
|
d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
|
||||||
mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
|
mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
|
||||||
cat('"mave",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
|
cat('"mave",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
|
||||||
time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
|
time['user.self'], ',', time['sys.self'], ',', time['elapsed'],
|
||||||
sep = '', file = log, append = TRUE)
|
'\n', sep = '', file = log, append = TRUE)
|
||||||
## and the `OPG` method
|
## and the `OPG` method
|
||||||
time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B),
|
time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B),
|
||||||
method = "meanOPG", screen = ncol(X)))
|
method = "meanOPG", screen = ncol(X)))
|
||||||
|
@ -71,18 +76,22 @@ for (rep in seq_len(args$reps)) {
|
||||||
d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
|
d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
|
||||||
mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
|
mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
|
||||||
cat('"opg",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
|
cat('"opg",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
|
||||||
time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
|
time['user.self'], ',', time['sys.self'], ',', time['elapsed'],
|
||||||
sep = '', file = log, append = TRUE)
|
'\n', sep = '', file = log, append = TRUE)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (args$run_cve) {
|
||||||
## Next the CVE method
|
## Next the CVE method
|
||||||
time <- system.time(dr <- cve.call(X, Y, k = ncol(B)))
|
time <- system.time(dr <- cve.call(X, Y, k = ncol(B)))
|
||||||
d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
|
d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
|
||||||
d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
|
d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
|
||||||
mse <- mean((predict(dr, ds.test$X, k = ncol(B)) - ds.test$Y)^2)
|
mse <- mean((predict(dr, ds.test$X, k = ncol(B)) - ds.test$Y)^2)
|
||||||
cat('"cve",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
|
cat('"cve",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
|
||||||
time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
|
time['user.self'], ',', time['sys.self'], ',', time['elapsed'],
|
||||||
sep = '', file = log, append = TRUE)
|
'\n', sep = '', file = log, append = TRUE)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (args$run_nn) {
|
||||||
## Fit `DR` Neuronal Network model
|
## Fit `DR` Neuronal Network model
|
||||||
time <- system.time(nn$fit(X, Y, epochs = args$epochs,
|
time <- system.time(nn$fit(X, Y, epochs = args$epochs,
|
||||||
batch_size = args$batch_size, initializer = args$initializer))
|
batch_size = args$batch_size, initializer = args$initializer))
|
||||||
|
@ -96,8 +105,9 @@ for (rep in seq_len(args$reps)) {
|
||||||
d.gra <- dist.grassmann(B, coef(nn))
|
d.gra <- dist.grassmann(B, coef(nn))
|
||||||
mse <- mean((nn$predict(ds.test$X) - ds.test$Y)^2)
|
mse <- mean((nn$predict(ds.test$X) - ds.test$Y)^2)
|
||||||
cat('"nn.ref",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
|
cat('"nn.ref",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
|
||||||
time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
|
time['user.self'], ',', time['sys.self'], ',', time['elapsed'],
|
||||||
sep = '', file = log, append = TRUE)
|
'\n', sep = '', file = log, append = TRUE)
|
||||||
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
## Invoke the garbage collector
|
## Invoke the garbage collector
|
||||||
|
|
|
@ -0,0 +1,49 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Catch the interrupt signal `SIGINT` (e.g. `^C`) and invoke `user_interupt`
|
||||||
|
trap user_interupt SIGINT
|
||||||
|
|
||||||
|
# Reports a user interrupt and exits the simulation script (does not continue
|
||||||
|
# with the next statement, which allows leaving the bash loop with `^C`)
|
||||||
|
user_interupt() {
|
||||||
|
echo -e '\nUser Interrupt -> stopping simulation\n'
|
||||||
|
exit
|
||||||
|
}
|
||||||
|
|
||||||
|
# Simulation for big data with `p` proportional to `sqrt(n)`
|
||||||
|
for ds in 6 8; do
|
||||||
|
command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=1000 --p=32 --epochs=200,400"
|
||||||
|
echo -e "\n$command"
|
||||||
|
time eval "$command"
|
||||||
|
command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=4000 --p=63 --epochs=100,200"
|
||||||
|
echo -e "\n$command"
|
||||||
|
time eval "$command"
|
||||||
|
command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=16000 --p=126 --epochs=50,100"
|
||||||
|
echo -e "\n$command"
|
||||||
|
time eval "$command"
|
||||||
|
command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=64000 --p=253 --epochs=25,50"
|
||||||
|
echo -e "\n$command"
|
||||||
|
time eval "$command"
|
||||||
|
command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=256000 --p=506 --epochs=12,25"
|
||||||
|
echo -e "\n$command"
|
||||||
|
time eval "$command"
|
||||||
|
done
|
||||||
|
|
||||||
|
# Simulation for big data with `p` proportional to `n` (note: for the base case
|
||||||
|
# of `n = 1000`, `p = 32` see above)
|
||||||
|
# For i = 1, ..., 4 the sample size `n = 1000 * 4^i`, number of predictors
|
||||||
|
# `p = 32 * 4^i` and the training epochs `epochs ~ (200, 400) * 1.5^(-i)`
|
||||||
|
for ds in 6 8; do
|
||||||
|
command="Rscript simulations_bigdata.R --reps=10 --run_mave=TRUE --run_cve=TRUE --dataset=$ds --n=4000 --p=128 --epochs=133,266"
|
||||||
|
echo -e "\n$command"
|
||||||
|
time eval "$command"
|
||||||
|
command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=16000 --p=512 --epochs=88,176"
|
||||||
|
echo -e "\n$command"
|
||||||
|
time eval "$command"
|
||||||
|
command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=64000 --p=2048 --epochs=59,118"
|
||||||
|
echo -e "\n$command"
|
||||||
|
time eval "$command"
|
||||||
|
command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=256000 --p=8192 --epochs=39,78"
|
||||||
|
echo -e "\n$command"
|
||||||
|
time eval "$command"
|
||||||
|
done
|
Loading…
Reference in New Issue