diff --git a/.gitignore b/.gitignore
index a2bebd3..b506cf5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ NNSDR/man/
 *.Rdata
 *.zip
 *.tar.gz
+*.tar.xz
 *.BAK
 
 # LaTeX - build/database/... files
diff --git a/NNSDR/R/datasets.R b/NNSDR/R/datasets.R
index d8c0019..8f8bfc0 100644
--- a/NNSDR/R/datasets.R
+++ b/NNSDR/R/datasets.R
@@ -181,17 +181,7 @@ rlaplace <- function(n = 1, mu = 0, sd = 1) {
 #' \eqn{e_j} is the \eqn{j}-th unit vector in the \eqn{p}-dimensional space.
 #' \eqn{Y} is given as \deqn{Y = (b_1'X)^2+(b_2'X)^2+(b_3'X)^2+0.5\epsilon}
 #' where \eqn{\epsilon} is standard normal.
-#' @section M7:
-#' The predictors are distributed as \eqn{X\sim t_3(I_p)}{X~t_3(I_p)} where
-#' \eqn{t_3(I_p)} is the standard multivariate t-distribution with 3 degrees of
-#' freedom, for a subspace dimension of \eqn{k = 4} with a default of
-#' \eqn{n = 200} data points.
-#' \eqn{p = 20, b_1 = e_1, b_2 = e_2, b_3 = e_3}, and \eqn{b_4 = e_p}, where
-#' \eqn{e_j} is the \eqn{j}-th unit vector in the \eqn{p}-dimensional space.
-#' \eqn{Y} is given as \deqn{Y = (b_1'X)(b_2'X)^2+(b_3'X)(b_4'X)+0.5\epsilon}
-#' where \eqn{\epsilon} is distributed as generalized normal distribution with
-#' location 0, shape-parameter 1, and the scale-parameter is chosen such that
-#' \eqn{Var(\epsilon) = 0.25}.
+#' @section M7: see "Local Linear Forests"
 #'
 #' @import stats
 #' @importFrom stats rnorm rbinom
@@ -253,18 +243,9 @@ dataset <- function(name = "M1", n = NULL, p = 20, sd = 0.5, ...) {
         X <- matrix(rnorm(n * p), n)
         Y <- rowSums((X %*% B)^2) + rnorm(n, 0, sd)
     } else if (name == "M7") {
-        if (missing(n)) { n <- 400 }
-        # B ... `p x 4`
-        B <- diag(p)[, -(4:(p - 1))]
-        # "R"andom "M"ulti"V"ariate "S"tudent
-        X <- rmvt(n = n, sigma = diag(p), df = 3)
-        XB <- X %*% B
-        Y <- (XB[, 1]) * (XB[, 2])^2 + (XB[, 3]) * (XB[, 4])
-        Y <- Y + rlaplace(n, 0, sd)
-    } else if (name == "M8") { # see: "Local Linear Forests"
         if (missing(n)) { n <- 600 }
-        if (missing(p)) { p <- 20 }  # 10 and 50 in "Local Linear Forests"
+        if (missing(p)) { p <- 20 } # 10 and 50 in "Local Linear Forests"
         if (missing(sd)) { sd <- 5 } # or 20
 
         B <- diag(1, p, 4)
@@ -274,11 +255,6 @@ dataset <- function(name = "M1", n = NULL, p = 20, sd = 0.5, ...) {
         XB <- X %*% B
         Y <- 10 * sin(pi * XB[, 1] * XB[, 2]) + 20 * (XB[, 3] - 0.5)^2 + 5 * XB[, 4] + rnorm(n, sd = sd)
         Y <- as.matrix(Y)
-    } else if (name == "M9") {
-        if (missing(n)) { n <- 300 }
-        X <- matrix(rnorm(n * p), n, p)
-        Y <- X[, 1] + (0.5 + X[, 2])^2 * rnorm(n)
-        B <- diag(1, p, 2)
     } else {
         stop("Got unknown dataset name.")
     }
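
Aside (not part of the patch): after this change the name `M7` refers to the "Local Linear Forests" model previously registered as `M8`, while the old heavy-tailed `M7` and the heteroskedastic `M9` are dropped. A minimal standalone sketch of the surviving generator; the sampling of `X` is elided between the two hunks above, so drawing it uniformly on the unit cube (the usual choice for this Friedman-style response) is an assumption:

# Hypothetical standalone version of the post-patch `M7` generator.
m7.sketch <- function(n = 600, p = 20, sd = 5) {
    B <- diag(1, p, 4)              # first four unit vectors, as in the patch
    X <- matrix(runif(n * p), n, p) # assumption: X ~ U(0, 1), per "Local Linear Forests"
    XB <- X %*% B
    Y <- 10 * sin(pi * XB[, 1] * XB[, 2]) + 20 * (XB[, 3] - 0.5)^2 +
        5 * XB[, 4] + rnorm(n, sd = sd)
    list(X = X, Y = as.matrix(Y), B = B)
}
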
diff --git a/simulations/colin_test.R b/simulations/colin_test.R
new file mode 100644
index 0000000..66bbdc2
--- /dev/null
+++ b/simulations/colin_test.R
@@ -0,0 +1,48 @@
+library(MAVE)
+library(CVarE)
+library(NNSDR)
+
+set.seed(797)
+
+dataset <- function(n = 100, p = 10, sd = 0.5) {
+  X <- matrix(rnorm(n * (p - 1)), n, p - 1)
+  X <- cbind(-0.5 * (X[, 1] + X[, 2]) + 0.001 * rnorm(n), X)
+  B <- diag(p)[, 4, drop = FALSE]
+  Y <- as.matrix(X[, 4]^2 + rnorm(n, 0, sd))
+  return(list(X = X, Y = Y, B = B))
+}
+
+nn <- nnsdr$new(
+  input_shapes = list(x = 10L),
+  d = 1L,
+  hidden_units = 512L,
+  activation = 'relu',
+  trainable_reduction = TRUE
+)
+
+sim <- data.frame(mave = rep(NA, 10), cve = NA, nn.opg = NA, nn.ref = NA)
+for (i in 1:10) {
+  cat(i, "/ 10\n")
+  with(dataset(), {
+    dr <- mave.compute(X, Y, method = 'meanMAVE', max.dim = 1)
+    sim[i, 'mave'] <<- dist.subspace(B, coef(dr, 1), normalize = TRUE)
+
+    dr <- cve.call(X, Y, k = 1)
+    sim[i, 'cve'] <<- dist.subspace(B, coef(dr, 1), normalize = TRUE)
+
+    nn$fit(X, Y, epochs = c(200L, 400L), batch_size = 32L, initializer = 'fromOPG')
+    sim[i, 'nn.opg'] <<- dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE)
+    sim[i, 'nn.ref'] <<- dist.subspace(B, coef(nn), normalize = TRUE)
+  })
+  nn$reset()
+}
+round(t(data.frame(
+  mean = colMeans(sim),
+  median = apply(sim, 2, median),
+  sd = apply(sim, 2, sd)
+)), 3)
+
+# &$\mave$& $\cve$&$\nn_{512}$ \\
+# mean   & 0.917 & 0.164 & 0.101 \\
+# median & 0.999 & 0.162 & 0.096 \\
+# sd     & 0.256 & 0.057 & 0.032 \\
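
Aside (not part of the patch): the design above makes the first predictor an almost exact linear combination of the next two (noise sd 0.001), which is what breaks `meanMAVE` in the commented result table. A quick base-R check of the induced ill-conditioning:

# Reproduce the `colin_test.R` design and verify the near-collinearity.
set.seed(797)
X <- matrix(rnorm(100 * 9), 100, 9)
X <- cbind(-0.5 * (X[, 1] + X[, 2]) + 0.001 * rnorm(100), X)
kappa(X)                                        # very large condition number
summary(lm(X[, 1] ~ X[, 2] + X[, 3]))$r.squared # practically 1
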
diff --git a/simulations/simulations.R b/simulations/simulations.R
new file mode 100644
index 0000000..179517e
--- /dev/null
+++ b/simulations/simulations.R
@@ -0,0 +1,98 @@
+#!/usr/bin/env Rscript
+
+library(MAVE)
+library(CVarE)
+Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings
+suppressPackageStartupMessages({
+  library(NNSDR)
+})
+
+## Parse script parameters
+args <- parse.args(defaults = list(
+  # Simulation configuration
+  reps = 100,             # Number of replications
+  dataset = '1',          # Name (number) of the data set
+  # Neural Net. structure/definitions
+  hidden_units = 512L,
+  activation = 'relu',
+  trainable_reduction = TRUE,
+  # Neural Net. training
+  epochs = c(200L, 400L), # Number of training epochs for (`OPG`, Refinement)
+  batch_size = 32L,
+  initializer = 'fromOPG',
+  seed = 1390L
+))
+
+## Generate reference data (gets re-sampled for each replication)
+ds <- dataset(args$dataset) # Generates a list with `X`, `Y`, `B` and `name`
+
+## Build Dimension Reduction Neural Network model (matching the data)
+nn <- nnsdr$new(
+  input_shapes = list(x = ncol(ds$X)),
+  d = ncol(ds$B),
+  hidden_units = args$hidden_units,
+  activation = args$activation,
+  trainable_reduction = args$trainable_reduction
+)
+
+## Open simulation log file, write simulation configuration and header
+log <- file(format(Sys.time(), "results/sim_%Y%m%d_%H%M.csv"), "w", blocking = FALSE)
+cat(paste('#', names(args), args, sep = ' ', collapse = '\n'), '\n',
+    'method,replication,dist.subspace,dist.grassmann,mse\n',
+    sep = '', file = log, append = TRUE)
+
+## Set seed for sampling simulation data (NOT affecting the `NN` initialization)
+set.seed(args$seed)
+
+## Repeated simulation runs
+for (rep in seq_len(args$reps)) {
+  ## Re-sample seeded data for each simulation replication
+  with(dataset(ds$name), {
+    ## Sample test dataset
+    ds.test <- dataset(ds$name, n = 1000)
+
+    ## First the reference method `MAVE`
+    dr <- mave(Y ~ X, method = "meanMAVE")
+    d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
+    d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
+    mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
+    cat('"mave",', rep, ',', d.sub, ',', d.gra, ',', mse, '\n', sep = '',
+        file = log, append = TRUE)
+    ## and the `OPG` method
+    dr <- mave(Y ~ X, method = "meanOPG")
+    d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
+    d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
+    mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
+    cat('"opg",', rep, ',', d.sub, ',', d.gra, ',', mse, '\n', sep = '',
+        file = log, append = TRUE)
+
+    ## Next the `CVE` method
+    dr <- cve(Y ~ X, k = ncol(B))
+    d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
+    d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
+    mse <- mean((predict(dr, ds.test$X, k = ncol(B)) - ds.test$Y)^2)
+    cat('"cve",', rep, ',', d.sub, ',', d.gra, ',', mse, '\n', sep = '',
+        file = log, append = TRUE)
+
+    ## Fit `NNSDR` model
+    nn$fit(X, Y, epochs = args$epochs,
+           batch_size = args$batch_size, initializer = args$initializer)
+    # `OPG` estimate
+    d.sub <- dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE)
+    d.gra <- dist.grassmann(B, coef(nn, 'OPG'))
+    cat('"nn.opg",', rep, ',', d.sub, ',', d.gra, ',', NA, '\n', sep = '',
+        file = log, append = TRUE)
+    # Refinement estimate
+    d.sub <- dist.subspace(B, coef(nn), normalize = TRUE)
+    d.gra <- dist.grassmann(B, coef(nn))
+    mse <- mean((nn$predict(ds.test$X) - ds.test$Y)^2)
+    cat('"nn.ref",', rep, ',', d.sub, ',', d.gra, ',', mse, '\n', sep = '',
+        file = log, append = TRUE)
+  })
+
+  ## Reset model
+  nn$reset()
+}
+
+## Finished, close simulation log file
+close(log)
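
Aside (not part of the patch): the log written above is a plain CSV whose configuration lines start with `#`, so a finished run can be summarized directly. A sketch, with a hypothetical file name standing in for the timestamped log (`na.action = na.pass` keeps the `nn.opg` rows, whose `mse` is always `NA`):

# Summarize a finished `simulations.R` run; the path is hypothetical.
sim <- read.csv("results/sim_20220101_1200.csv", comment.char = "#")
aggregate(cbind(dist.subspace, dist.grassmann, mse) ~ method, data = sim,
          FUN = mean, na.action = na.pass)
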
diff --git a/simulations/simulations_bigdata.R b/simulations/simulations_bigdata.R
new file mode 100644
index 0000000..e05dfe6
--- /dev/null
+++ b/simulations/simulations_bigdata.R
@@ -0,0 +1,111 @@
+#!/usr/bin/env Rscript
+
+library(MAVE)
+library(CVarE)
+Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings
+suppressPackageStartupMessages({
+  library(NNSDR)
+})
+
+## Parse script parameters
+args <- parse.args(defaults = list(
+  # Simulation configuration
+  reps = 10L,             # Number of replications
+  dataset = '6',          # Name (number) of the data set
+  # Neural Net. structure/definitions
+  hidden_units = 512L,
+  activation = 'relu',
+  trainable_reduction = TRUE,
+  # Neural Net. training
+  epochs = c(200L, 400L), # Number of training epochs for (`OPG`, Refinement)
+  batch_size = 32L,
+  initializer = 'fromOPG',
+  # Simulation data generation configuration
+  seed = 1390L,
+  n = 100L,
+  p = 20L
+))
+
+## Generate reference data (gets re-sampled for each replication)
+# The number of observations is irrelevant here; the reference is only used
+# to build a matching `NNSDR` estimator.
+ds <- dataset(args$dataset, n = 100L, p = args$p) # Generates a list with `X`, `Y`, `B` and `name`
+
+## Build Dimension Reduction Neural Network model (matching the data)
+nn <- nnsdr$new(
+  input_shapes = list(x = ncol(ds$X)),
+  d = ncol(ds$B),
+  hidden_units = args$hidden_units,
+  activation = args$activation,
+  trainable_reduction = args$trainable_reduction
+)
+
+## Open simulation log file, write simulation configuration and header
+log <- file(format(Sys.time(), "results/sim_big_%Y%m%d_%H%M.csv"), "w", blocking = FALSE)
+cat(paste('#', names(args), args, sep = ' ', collapse = '\n'), '\n',
+    'method,replication,dist.subspace,dist.grassmann,mse,time.user,time.system,time.elapsed\n',
+    sep = '', file = log, append = TRUE)
+
+## Repeated simulation runs
+for (rep in seq_len(args$reps)) {
+  ## Re-sample seeded data for each simulation replication
+  with(dataset(ds$name, n = args$n, p = args$p), {
+    ## Sample test dataset
+    ds.test <- dataset(ds$name, n = 1000L, p = args$p)
+
+    ## First the reference method `MAVE`
+    # To keep the timing comparison fair, set `max.dim` to the true reduction
+    # dimension; with `screen = ncol(X)` screening is turned "off".
+    time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B),
+                                           method = "meanMAVE", screen = ncol(X)))
+    d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
+    d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
+    mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
+    cat('"mave",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
+        time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
+        sep = '', file = log, append = TRUE)
+    ## and the `OPG` method
+    time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B),
+                                           method = "meanOPG", screen = ncol(X)))
+    d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
+    d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
+    mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
+    cat('"opg",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
+        time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
+        sep = '', file = log, append = TRUE)
+
+    ## Next the `CVE` method
+    time <- system.time(dr <- cve.call(X, Y, k = ncol(B)))
+    d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
+    d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
+    mse <- mean((predict(dr, ds.test$X, k = ncol(B)) - ds.test$Y)^2)
+    cat('"cve",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
+        time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
+        sep = '', file = log, append = TRUE)
+
+    ## Fit `DR` Neural Network model
+    time <- system.time(nn$fit(X, Y, epochs = args$epochs,
+                               batch_size = args$batch_size, initializer = args$initializer))
+    # `OPG` estimate
+    d.sub <- dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE)
+    d.gra <- dist.grassmann(B, coef(nn, 'OPG'))
+    cat('"nn.opg",', rep, ',', d.sub, ',', d.gra, ',NA,NA,NA,NA\n',
+        sep = '', file = log, append = TRUE)
+    # Refinement estimate
+    d.sub <- dist.subspace(B, coef(nn), normalize = TRUE)
+    d.gra <- dist.grassmann(B, coef(nn))
+    mse <- mean((nn$predict(ds.test$X) - ds.test$Y)^2)
+    cat('"nn.ref",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
+        time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
+        sep = '', file = log, append = TRUE)
+  })
+
+  ## Invoke the garbage collector
+  gc()
+
+  ## Reset model
+  nn$reset()
+}
+
+## Finished, close simulation log file
+close(log)
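
Aside (not part of the patch): the `nn.opg` rows deliberately carry `NA` timings, since the single `system.time()` call covers the OPG stage and the refinement stage together. Per-method timings can be pulled from the log the same way as above (hypothetical file name):

# Median per-method timings of a `simulations_bigdata.R` run.
sim <- read.csv("results/sim_big_20220101_1200.csv", comment.char = "#")
aggregate(cbind(time.user, time.system, time.elapsed) ~ method, data = sim,
          FUN = median, na.action = na.pass)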