From a673b50c7a6ec85c571c3a0dfc8c790895edb056 Mon Sep 17 00:00:00 2001
From: daniel
Date: Fri, 10 Sep 2021 14:58:47 +0200
Subject: [PATCH] add: custom metrics support and metric.subspace, wip: new
 binary response datasets, add: nn$evaluate, add: simulation_binary

---
 NNSDR/NAMESPACE                    |   1 +
 NNSDR/R/datasets.R                 |  48 +++++++++
 NNSDR/R/metric_subspace.R          |  58 ++++++++++
 NNSDR/R/nnsdr_class.R              |  26 ++++-
 real_data/MNIST.R                  | 163 ++++++++++++++++++++++++++++
 simulations/edit_sim_data.R        |  92 ++++++++++++++++
 simulations/simulations.R          |   5 +-
 simulations/simulations_bigdata.R  |   4 +-
 simulations/simulations_bigdata.sh |  36 +++----
 simulations/simulations_binary.R   | 167 ++++++++++-------------------
 simulations/simulations_binary.sh  |  24 +++++
 11 files changed, 493 insertions(+), 131 deletions(-)
 create mode 100644 NNSDR/R/metric_subspace.R
 create mode 100644 real_data/MNIST.R
 create mode 100644 simulations/edit_sim_data.R
 create mode 100644 simulations/simulations_binary.sh

diff --git a/NNSDR/NAMESPACE b/NNSDR/NAMESPACE
index 0dd3b71..815c72f 100644
--- a/NNSDR/NAMESPACE
+++ b/NNSDR/NAMESPACE
@@ -7,6 +7,7 @@ export(dist.grassmann)
 export(dist.mave)
 export(dist.subspace)
 export(get.script)
+export(metric.subspace)
 export(nnsdr)
 export(parse.args)
 export(reinitialize_weights)
diff --git a/NNSDR/R/datasets.R b/NNSDR/R/datasets.R
index 8f8bfc0..88d8da7 100644
--- a/NNSDR/R/datasets.R
+++ b/NNSDR/R/datasets.R
@@ -255,6 +255,54 @@ dataset <- function(name = "M1", n = NULL, p = 20, sd = 0.5, ...) {
         XB <- X %*% B
         Y <- 10 * sin(pi * XB[, 1] * XB[, 2]) + 20 * (XB[, 3] - 0.5)^2 + 5 * XB[, 4] + rnorm(n, sd = sd)
         Y <- as.matrix(Y)
+    } else if (name == "B1") {
+        if (missing(n)) { n <- 400 }
+
+        B <- diag(1, p, 1)
+        X <- matrix(runif(n * p, -2, 2), n, p)
+        Y <- (X %*% B + rnorm(n, 0, sd)) > 0
+    } else if (name == "B2") {
+        if (missing(n)) { n <- 400 }
+
+        B <- diag(1, p, 2)
+        Y <- as.matrix(sample(as.logical(0:1), n, replace = TRUE))
+        X <- matrix(rnorm(n * p, sd = sd), n, p)
+        x1 <- sample(c(-1, 1), n, replace = TRUE)
+        x2 <- x1 * c(-1, 1)[Y + 1]
+        X[, 1] <- X[, 1] + x1
+        X[, 2] <- X[, 2] + x2
+    } else if (name == "B3") {
+        if (missing(n)) { n <- 1600 }
+        if (missing(sd)) { sd <- 0.1 }
+
+        B <- diag(1, p, 3)
+        Y <- sample(as.logical(0:1), n, replace = TRUE)
+        x <- seq(0, 6 * pi, length.out = n)
+        X <- matrix(rnorm(n * p, sd = sd), n, p)
+        X[, 1] <- X[, 1] + 0.5 * sin(x + pi * Y)
+        X[, 2] <- X[, 2] + 0.5 * cos(x + pi * Y)
+        X[, 3] <- X[, 3] + (x - 3 * pi) / (3 * pi)
+    } else if (name == "B4") {
+        # Let
+        #   X ~ N_p(0, I_p)
+        # and Z = B' X, then
+        #   Y | X ~ Binom(1, sin(atan2(Z_1, Z_2) + Z_3)^2)
+        # or in other words
+        #   E(Y | X) = sin(atan2(Z_1, Z_2) + Z_3)^2
+
+        if (missing(n)) { n <- 1600 }
+
+        B <- diag(1, p, 3)
+        X <- matrix(rnorm(n * p, sd = sd), n, p)
+
+        # Angle and height for each X with respect to the spiral manifold
+        XB <- X %*% B
+        phi <- atan2(XB[, 1], XB[, 2])
+        h <- XB[, 3]
+
+        # Draw Y with P(Y = TRUE | X) = prob; note that the first column of
+        # the probability matrix belongs to FALSE, the second to TRUE
+        prob <- sin(phi + h)^2
+        Y <- apply(cbind(1 - prob, prob), 1, sample, x = as.logical(0:1),
+                   size = 1, replace = FALSE)
     } else {
         stop("Got unknown dataset name.")
     }
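
The `B4` branch states its conditional response model explicitly, so the sampling can be sanity-checked empirically: within bins of the stated success probability, the observed response rate should track the bin centers. A minimal sketch, not part of the patch, assuming the patched `NNSDR` package is installed:

    library(NNSDR)

    set.seed(42)
    ds <- dataset("B4", n = 100000L)

    # Recompute the success probability from the reduced predictors Z = B' X
    Z <- ds$X %*% ds$B
    prob <- sin(atan2(Z[, 1], Z[, 2]) + Z[, 3])^2

    # Empirical P(Y = TRUE) per probability bin should match the bin centers
    bins <- cut(prob, breaks = seq(0, 1, by = 0.1))
    print(tapply(as.vector(ds$Y), bins, mean))
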
diff --git a/NNSDR/R/metric_subspace.R b/NNSDR/R/metric_subspace.R
new file mode 100644
index 0000000..4eaa46f
--- /dev/null
+++ b/NNSDR/R/metric_subspace.R
@@ -0,0 +1,58 @@
+#' @export
+metric.subspace <- function(B_true,
+    X = NULL, Y = NULL,
+    type = c("Refinement", "OPG"),
+    name = "metric.subspace",
+    normalize = FALSE
+) {
+    type <- match.arg(type)
+
+    if (!is.matrix(B_true))
+        B_true <- as.matrix(B_true)
+    P_true <- B_true %*% solve(crossprod(B_true), t(B_true))
+    P_true <- tf$constant(P_true, dtype = 'float32')
+
+    if (normalize) {
+        rankSum <- 2 * ncol(B_true)
+        c <- 1 / sqrt(min(rankSum, 2 * nrow(B_true) - rankSum))
+    } else {
+        c <- sqrt(2)
+    }
+    c <- tf$constant(c, dtype = 'float32')
+
+    if (type == "Refinement") {
+        structure(function(model) {
+            B <- model$get_layer('reduction')$weights
+            function(y_true, y_pred) {
+                P <- tf$linalg$matmul(B, B, transpose_b = TRUE)
+                diff <- P_true - P
+                c * tf$sqrt(tf$reduce_sum(tf$math$multiply(diff, diff)))
+            }
+        },
+            class = c("nnsdr.metric", "Refinement"),
+            name = name
+        )
+    } else {
+        X <- tf$cast(X, dtype = 'float32')
+        # 0-based start/extent selecting the last `ncol(B_true)` eigenvector
+        # columns (`eigh` returns eigenvalues in ascending order)
+        begin <- tf$cast(c(0L, nrow(B_true) - ncol(B_true)), dtype = 'int32')
+        size <- tf$cast(c(nrow(B_true), ncol(B_true)), dtype = 'int32')
+        structure(function(model) {
+            function(y_true, y_pred) {
+                with(tf$GradientTape() %as% tape, {
+                    tape$watch(X)
+                    out <- model(X)
+                })
+                G <- tape$gradient(out, X)
+                B <- tf$linalg$eigh(tf$linalg$matmul(G, G, transpose_a = TRUE))
+                B <- tf$linalg$qr(tf$slice(B[[2]], begin, size))$q
+
+                P <- tf$linalg$matmul(B, B, transpose_b = TRUE)
+                diff <- P_true - P
+                c * tf$sqrt(tf$reduce_sum(tf$math$multiply(diff, diff)))
+            }
+        },
+            class = c("nnsdr.metric", "OPG"),
+            name = name
+        )
+    }
+}
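
`metric.subspace` comes in two flavors selected via `type`: the "Refinement" variant reads the current `reduction` layer weights, while the "OPG" variant recomputes the gradient-based estimate on the fly, which is why it additionally takes the data. Both are meant to be listed side by side in the `metrics` argument and are dispatched to the matching sub-network by their class attribute (see the `build.MLP` hunk in `nnsdr_class.R` below). A sketch, with `B_true`, `X` and `Y` as placeholders for the true projection matrix and the training data:

    metrics <- list(
        'accuracy',
        # tracked while training the OPG network
        metric.subspace(B_true, X, Y, type = "OPG", normalize = TRUE),
        # tracked while training the refinement network
        metric.subspace(B_true, type = "Refinement", normalize = TRUE)
    )
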
diff --git a/NNSDR/R/nnsdr_class.R b/NNSDR/R/nnsdr_class.R
index fabf7f1..782b042 100644
--- a/NNSDR/R/nnsdr_class.R
+++ b/NNSDR/R/nnsdr_class.R
@@ -72,7 +72,7 @@ build.MLP <- function(input_shapes, d, name, add_reduction,
 
     if (!is.null(metrics)) {
         metrics <- as.list(metrics)
-        for (i in seq_along(metrics)) {
+        for (i in rev(seq_along(metrics))) {
             metric <- metrics[[i]]
             if (all(c("nnsdr.metric", name) %in% class(metric))) {
                 metric_fn <- reticulate::py_func(metric(mlp))
@@ -275,6 +275,30 @@ nnsdr <- setRefClass('nnsdr',
             as.array(output)
         }
     },
+    evaluate = function(inputs, output) {
+        if (is.list(inputs)) {
+            inputs <- Map(tf$cast, as.list(inputs), dtype = 'float32')
+        } else {
+            inputs <- list(tf$cast(inputs, dtype = 'float32'))
+        }
+        eval.opg <- .self$nn.opg$evaluate(inputs, output,
+            return_dict = TRUE, verbose = 0L)
+
+        if (is.null(.self$history.ref))
+            return(data.frame(eval.opg, row.names = "OPG"))
+
+        eval.ref <- .self$nn.ref$evaluate(inputs, output,
+            return_dict = TRUE, verbose = 0L)
+
+        # Convert to data.frame
+        eval.opg <- data.frame(eval.opg, row.names = "OPG")
+        eval.ref <- data.frame(eval.ref, row.names = "Refinement")
+        # Augment mutually exclusive columns
+        eval.opg[setdiff(names(eval.ref), names(eval.opg))] <- NA
+        eval.ref[setdiff(names(eval.opg), names(eval.ref))] <- NA
+        # Combine/Bind
+        rbind(eval.opg, eval.ref)
+    },
    coef = function(type = c('Refinement', 'OPG')) {
        type <- match.arg(type)
        if (type == 'Refinement') {
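
The new `evaluate` method wraps the keras `evaluate` of both sub-networks and returns a `data.frame` with one row per network ("OPG" alone until refinement training has run); metrics tracked by only one of the two models are padded with `NA` in the other row. A hypothetical session, with placeholder numbers:

    eval <- nn$evaluate(ds.test$X, ds.test$Y)
    eval
    #>             loss accuracy metric.subspace
    #> OPG        0.512    0.781              NA
    #> Refinement 0.334    0.862           0.241

    # Indexing works as for any data.frame
    eval[["Refinement", "accuracy"]]
    #> [1] 0.862
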
diff --git a/real_data/MNIST.R b/real_data/MNIST.R
new file mode 100644
index 0000000..88d37cf
--- /dev/null
+++ b/real_data/MNIST.R
@@ -0,0 +1,163 @@
+library(keras)
+
+num_classes <- 10L
+epochs <- 20L
+batch_size <- 128L
+
+################################################################################
+###                    Loading & Preparing MNIST dataset                     ###
+################################################################################
+
+c(c(x_train, y_train), c(x_test, y_test)) %<-% dataset_mnist()
+
+x_train <- array_reshape(x_train, c(nrow(x_train), prod(dim(x_train)[-1])))
+x_test  <- array_reshape(x_test , c(nrow(x_test ), prod(dim(x_test )[-1])))
+
+# x_train <- x_train / 255
+# x_test <- x_test / 255
+center  <- apply(x_train, 2, mean)
+x_train <- (x_train - center) / 128
+x_test  <- (x_test - center) / 128
+
+y_train <- to_categorical(y_train, num_classes)
+y_test  <- to_categorical(y_test , num_classes)
+
+################################################################################
+###                              Model Creation                              ###
+################################################################################
+
+model <- keras_model_sequential(name = 'base_model')
+model %>%
+    layer_dense(units = 256L, activation = 'relu',
+                input_shape = ncol(x_train)) %>%
+    layer_dropout(rate = 0.4) %>%
+    layer_dense(units = 128L, activation = 'relu') %>%
+    layer_dropout(rate = 0.3) %>%
+    layer_dense(units = num_classes, activation = 'softmax')
+
+summary(model)
+
+model %>% compile(
+    loss = 'categorical_crossentropy',
+    optimizer = 'RMSProp',
+    metrics = c('accuracy')
+)
+
+################################################################################
+###                           Base Model Training                            ###
+################################################################################
+
+history.base <- model %>% fit(
+    x_train, y_train,
+    batch_size = batch_size,
+    epochs = epochs,
+    verbose = 1L,
+    validation_split = 0.1
+)
+
+plot(history.base)
+
+score <- model %>% evaluate(x_test, y_test, verbose = 0L)
+
+cat('Test loss:     ', score[['loss']], '\n',
+    'Test accuracy: ', score[['accuracy']], '\n', sep = '')
+
+################################################################################
+###                            OPG Data Reduction                            ###
+################################################################################
+library(tensorflow)
+library(ggplot2)
+
+G <- local({
+    X <- tf$cast(x_train, 'float32')
+    with(tf$GradientTape() %as% tape, {
+        tape$watch(X)
+        Y <- model(X)
+    })
+    as.matrix(tape$gradient(Y, X))
+})
+eig <- eigen(var(G), symmetric = TRUE)
+B.opg <- eig$vectors[, 1:2]
+
+# ggplot(data.frame(values = eig$values[1:25]), aes(x = seq_along(values), y = values)) +
+#     geom_line()
+
+ggplot(data.frame(x_test %*% B.opg, y = factor(apply(y_test, 1, which.max))),
+       aes(x = X1, y = X2, group = y, color = y)) +
+    geom_point()
+
+################################################################################
+###                             Refinement Model                             ###
+################################################################################
+weights <- model$get_weights()
+
+model.ref <- keras_model_sequential(name = 'Refinement')
+model.ref %>%
+    layer_dense(units = ncol(B.opg), activation = 'relu',
+                input_shape = ncol(x_train), use_bias = FALSE,
+                weights = list(B.opg)) %>%
+    layer_dense(units = 256L, activation = 'relu',
+                weights = list(crossprod(B.opg, weights[[1]]), weights[[2]])) %>%
+    layer_dropout(rate = 0.4) %>%
+    layer_dense(units = 128L, activation = 'relu',
+                weights = weights[3:4]) %>%
+    layer_dropout(rate = 0.3) %>%
+    layer_dense(units = num_classes, activation = 'softmax',
+                weights = weights[5:6])
+
+summary(model.ref)
+
+model.ref %>% compile(
+    loss = 'categorical_crossentropy',
+    optimizer = 'RMSProp',
+    metrics = c('accuracy')
+)
+
+################################################################################
+###                        Refinement Model Training                         ###
+################################################################################
+
+history.ref <- model.ref %>% fit(
+    x_train, y_train,
+    batch_size = batch_size,
+    epochs = epochs,
+    verbose = 1L,
+    validation_split = 0.1
+)
+
+plot(history.ref)
+
+score <- model.ref %>% evaluate(x_test, y_test, verbose = 0L)
+
+cat('Test loss:     ', score[['loss']], '\n',
+    'Test accuracy: ', score[['accuracy']], '\n', sep = '')
+
+### Combine Histories
+hist <- structure(list(
+    params = list(
+        verbose = min(history.base$params$verbose, history.ref$params$verbose),
+        epochs = history.base$params$epochs + history.ref$params$epochs,
+        steps = max(history.base$params$steps, history.ref$params$steps)
+    ),
+    metrics = lapply(structure(names(history.base$metrics), names = names(history.base$metrics)),
+        function(name) c(history.base$metrics[[name]], history.ref$metrics[[name]]))
+), class = "keras_training_history")
+
+plot(hist, smooth = FALSE)
+
+
+B.ref <- model.ref$get_weights()[[1]]
+
+ggplot(data.frame(x_test %*% B.ref, y = factor(apply(y_test, 1, which.max))),
+       aes(x = X1, y = X2, group = y, color = y)) +
+    geom_point()
+
+
+B.pca <- eigen(var(x_train), symmetric = TRUE)$vectors[, 1:2]
+ggplot(data.frame(x_test %*% B.pca, y = factor(apply(y_test, 1, which.max))),
+       aes(x = X1, y = X2, group = y, color = y)) +
+    geom_point()
+
+image.ref <- matrix(((B.ref - min(B.ref)) / abs(diff(range(B.ref))))[, 2], 28, 28)
+plot(c(0, 28), c(0, 28), type = "n", xlab = "", ylab = "")
+rasterImage(image.ref, 0, 0, 28, 28, interpolate = TRUE)
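
`MNIST.R` differentiates the base model at all 60000 training images in a single `GradientTape` pass, which can exhaust GPU memory. A batched variant of the same OPG gradient computation, a sketch that is not part of the patch (the chunk size of 10000 is an arbitrary choice), accumulates the gradient matrix chunk-wise:

    G <- local({
        idx <- split(seq_len(nrow(x_train)), (seq_len(nrow(x_train)) - 1) %/% 10000L)
        do.call(rbind, lapply(idx, function(i) {
            # Gradients of the model outputs w.r.t. one chunk of inputs
            X <- tf$cast(x_train[i, , drop = FALSE], 'float32')
            with(tf$GradientTape() %as% tape, {
                tape$watch(X)
                Y <- model(X)
            })
            as.matrix(tape$gradient(Y, X))
        }))
    })
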
diff --git a/simulations/edit_sim_data.R b/simulations/edit_sim_data.R
new file mode 100644
index 0000000..e6c69e2
--- /dev/null
+++ b/simulations/edit_sim_data.R
@@ -0,0 +1,92 @@
+library(ggplot2)
+
+################################################################################
+###                             General Helpers                              ###
+################################################################################
+
+# Read/Combine simulation data logs
+read.logs <- function(pattern) {
+    # Folder containing log files
+    path <- if (file.exists('results/')) './' else './simulations/'
+    path <- paste0(path, 'results/')
+    # Read all log files and augment with meta-parameters
+    file.names <- list.files(path, pattern, full.names = TRUE)
+    sim <- do.call(rbind, lapply(file.names, function(path) {
+        # Read simulation log (one file)
+        sim <- read.csv(path, comment.char = '#')
+        # Add simulation arguments as columns
+        args <- Filter(function(line) startsWith(line, '#'), readLines(path))
+        args <- sub('# ?', '', args)
+        args <- regmatches(args, regexpr(' ', args), invert = TRUE)
+        # Try to convert meta-parameters from string into int/num/bool/...
+        for (arg in args) {
+            val <- tryCatch(
+                eval(parse(text = arg[2])),
+                error = function(err) arg[2]
+            )
+            if (length(val) > 1) val <- paste(val, collapse = ',')
+            sim[[arg[1]]] <- val
+        }
+        sim
+    }))
+    # Convert methods to factors
+    sim$method <- factor(sim$method,
+        levels = c("opg", "mave", "cve", "sir", "save", "phdy", "nn.opg", "nn.ref"),
+        labels = c("OPG", "MAVE", "CVE", "SIR", "SAVE", "PHD", "NN-OPG", "NN-Ref")
+    )
+    sim
+}
+
+
+################################################################################
+###                                 Big Data                                 ###
+################################################################################
+
+# Read/Combine big data simulation logs
+sim <- read.logs('sim_big_.*csv')
+
+# Compute mean and standard deviation over replications
+(aggr <- merge(
+    aggregate(dist.subspace ~ dataset + n + p + method, sim, mean),
+    aggregate(dist.subspace ~ dataset + n + p + method, sim, sd),
+    by = c("dataset", "n", "p", "method"),
+    suffixes = c(".mean", ".sd")
+))
+
+# plots and tables
+ggplot(aggr, aes(x = n, y = dist.subspace.mean,
+                 group = interaction(dataset, method),
+                 color = method, linetype = dataset)) +
+    geom_line() +
+    geom_errorbar(aes(
+        ymin = dist.subspace.mean - dist.subspace.sd,
+        ymax = dist.subspace.mean + dist.subspace.sd
+    ), width = 0.2) +
+    scale_x_continuous(trans = 'log2')
+
+################################################################################
+###                             Binary Response                              ###
+################################################################################
+
+# Read/Combine binary data simulation logs
+sim <- read.logs('sim_binary_[0-9_]*\\.csv')
+
+# Aggregated Tables
+aggr.formula <- cbind(dist.subspace, accuracy) ~ dataset + method
+aggr <- merge(
+    aggregate(aggr.formula, sim, mean,
+              na.action = na.pass),
+    aggregate(aggr.formula, sim, sd,
+              na.action = na.pass),
+    by = attr(terms(aggr.formula), "term.labels"),
+    suffixes = c(".mean", ".sd")
+)
+print(aggr[with(aggr, order(dataset, dist.subspace.mean)), ], digits = 3)
+
+
+# box-plot subspace comparison
+ggplot(sim, aes(x = method, y = dist.subspace,
+                group = interaction(dataset, method),
+                color = dataset)) +
+    geom_boxplot() +
+    labs(title = "Sim. Binary", x = "Methods", y = "Subspace Dist.", color = "Datasets")
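
`read.logs` relies on the convention, used by the simulation scripts (see `simulations_binary.R` below), that each log file carries its configuration as `# key value` comment lines ahead of the CSV header. Schematically, with all values being placeholders:

    # reps 100
    # dataset B1
    # epochs c(5, 10)
    method,replication,dist.subspace,dist.grassmann,accuracy
    "sir",1,0.421,1.25,NA
    "nn.opg",1,0.387,1.13,0.84

Each `# key value` pair becomes a column of the combined data.frame; values that parse as R expressions, like the `epochs` vector, are evaluated and collapsed into a comma-separated string.
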
diff --git a/simulations/simulations.R b/simulations/simulations.R
index 4c595bf..4cc1477 100644
--- a/simulations/simulations.R
+++ b/simulations/simulations.R
@@ -11,7 +11,7 @@ suppressPackageStartupMessages({
 args <- parse.args(defaults = list(
     # Simulation configuration
     reps = 100,             # Number of replications
-    dataset = '1',          # Name (number) of the data set
+    dataset = 'M1',         # Name (number) of the data set
     # Neuronal Net. structure/definitions
     hidden_units = 512L,
     activation = 'relu',
@@ -24,7 +24,8 @@ args <- parse.args(defaults = list(
 ))
 
 ## Generate reference data (gets re-sampled for each replication)
-ds <- dataset(args$dataset)     # Generates a list with `X`, `Y`, `B` and `name`
+# Generates a list with `X`, `Y`, `B` and `name`
+ds <- dataset(args$dataset, n = 100)
 
 ## Build Dimension Reduction Neuronal Network model (matching the data)
 nn <- nnsdr$new(
diff --git a/simulations/simulations_bigdata.R b/simulations/simulations_bigdata.R
index 757c1ce..9a6070c 100644
--- a/simulations/simulations_bigdata.R
+++ b/simulations/simulations_bigdata.R
@@ -11,7 +11,7 @@ suppressPackageStartupMessages({
 args <- parse.args(defaults = list(
     # Simulation configuration
     reps = 10L,             # Number of replications
-    dataset = '6',          # Name (number) of the data set
+    dataset = 'M6',         # Name (number) of the data set
     # Sets if reference methods shall be evaluated
     run_mave = TRUE,
     run_cve = TRUE,
@@ -34,6 +34,8 @@ args <- parse.args(defaults = list(
 # Number of observations are irrelevant for the reference to generate a matching
 # `NNSDR` estimator
 ds <- dataset(args$dataset, n = 100L, p = args$p)   # Generates a list with `X`, `Y`, `B` and `name`
+# Normalize the dataset name (before it is written to the log/results file)
+args$dataset <- ds$name
 
 ## Build Dimension Reduction Neuronal Network model (matching the data)
 nn <- nnsdr$new(
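
For reference, the `(n, p)` pairs hard-coded in the (now commented-out) loop of `simulations_bigdata.sh` below follow `p = round(sqrt(n))`, with the epoch budget roughly halved each time `n` quadruples. In R:

    n <- 1000 * 4^(0:4)
    data.frame(n = n, p = round(sqrt(n)))
    #>        n   p
    #> 1   1000  32
    #> 2   4000  63
    #> 3  16000 126
    #> 4  64000 253
    #> 5 256000 506
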
diff --git a/simulations/simulations_bigdata.sh b/simulations/simulations_bigdata.sh
index d0ffe07..0107f2d 100644
--- a/simulations/simulations_bigdata.sh
+++ b/simulations/simulations_bigdata.sh
@@ -10,24 +10,24 @@ user_interupt() {
     exit
 }
 
-# Simulation for big data with `p` proportional to `sqrt(n)`
-for ds in 6 8; do
-    command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=1000 --p=32 --epochs=200,400"
-    echo -e "\n$command"
-    time eval "$command"
-    command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=4000 --p=63 --epochs=100,200"
-    echo -e "\n$command"
-    time eval "$command"
-    command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=16000 --p=126 --epochs=50,100"
-    echo -e "\n$command"
-    time eval "$command"
-    command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=64000 --p=253 --epochs=25,50"
-    echo -e "\n$command"
-    time eval "$command"
-    command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=256000 --p=506 --epochs=12,25"
-    echo -e "\n$command"
-    time eval "$command"
-done
+# # Simulation for big data with `p` proportional to `sqrt(n)`
+# for ds in 6 8; do
+#     command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=1000 --p=32 --epochs=200,400"
+#     echo -e "\n$command"
+#     time eval "$command"
+#     command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=4000 --p=63 --epochs=100,200"
+#     echo -e "\n$command"
+#     time eval "$command"
+#     command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=16000 --p=126 --epochs=50,100"
+#     echo -e "\n$command"
+#     time eval "$command"
+#     command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=64000 --p=253 --epochs=25,50"
+#     echo -e "\n$command"
+#     time eval "$command"
+#     command="Rscript simulations_bigdata.R --reps=10 --run_mave=FALSE --run_cve=FALSE --dataset=$ds --n=256000 --p=506 --epochs=12,25"
+#     echo -e "\n$command"
+#     time eval "$command"
+# done
 
 # Simulation for big data with `p` proportional to `n` (note: for the base case
 # of `n = 1000`, `p = 32` see above)
diff --git a/simulations/simulations_binary.R b/simulations/simulations_binary.R
index 4a27c3f..a90f536 100644
--- a/simulations/simulations_binary.R
+++ b/simulations/simulations_binary.R
@@ -1,9 +1,8 @@
 #!/usr/bin/env Rscript
-library(MAVE)
-library(CVarE)
 Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3")  # Suppress `tensorflow` notes/warnings
 suppressPackageStartupMessages({
+    library(dr)
     library(NNSDR)
 })
 
@@ -11,97 +10,21 @@ suppressPackageStartupMessages({
 args <- parse.args(defaults = list(
     # Simulation configuration
     reps = 100,                 # Number of replications
-    dataset = 'B2',             # Name ('B' for Binary) of the data set
+    dataset = 'B1',             # Name ('B' for Binary) of the data set
     # Neuronal Net. structure/definitions
     hidden_units = 512L,
     activation = 'relu',
     trainable_reduction = TRUE,
     # Neuronal Net. training
-    epochs = c(100L, 200L),     # Number of training epochs for (`OPG`, Refinement)
+    epochs = c(3L, 5L),         # Number of training epochs for (`OPG`, Refinement)
     batch_size = 32L,
     initializer = 'fromOPG',
     seed = 956294L
 ))
 
-################################################################################
-
-dataset <- function(name = "M1", n = NULL, p = 20, sd = 0.5, ...) {
-    name <- toupper(name)
-    if (nchar(name) == 1) { name <- paste0("M", name) }
-
-    if (name == "B1") {
-        if (missing(n)) { n <- 400 }
-        if (missing(sd)) { sd <- 1 }
-        eps <- sqrt(.Machine$double.eps)
-
-        B <- diag(1, p, 2)
-        Z <- matrix(rnorm(2 * n, 0, sd), n, 2)
-        Y <- sample(c(FALSE, TRUE), n, replace = TRUE)
-        theta <- rnorm(n, Y * pi, 0.25 * pi)
-        X <- cbind(
-            10 * (cos(theta) + 0.75 * (2 * Y - 1)) + Z[, 1],
-            10 * sin(theta) + Z[, 2],
-            matrix(rnorm(n * (p - 2), 0, sd), n)
-        )
-    } else if (name == "B2") {
-        if (missing(n)) { n <- 400 }
-        if (missing(sd)) { sd <- 0.2 }
-        eps <- sqrt(.Machine$double.eps)
-
-        B <- diag(1, p, 2)
-        X <- matrix(runif(n * p, -2, 2), n, p)
-        XB <- X %*% B
-        Y <- (sin(XB[, 1]) / (XB[, 2]^2 + eps) + rnorm(n, 0, sd)) > 0
-    # } else if (name == "B2") {
-    #     if (missing(n)) { n <- 400 }
-    #     if (missing(sd)) { sd <- 0.2 }
-    #     eps <- sqrt(.Machine$double.eps)
-
-    #     B <- diag(1, p, 2)
-    #     X <- matrix(runif(n * p, -2, 2), n, p)
-    #     XB <- X %*% B
-    #     Y <- (sin(XB[, 1]) / (XB[, 2]^2 + eps) + rnorm(n, 0, sd)) > 0
-    } else {
-        stop("Got unknown dataset name.")
-    }
-
-    return(list(X = X, Y = as.matrix(Y), B = B, name = name))
-}
-
-## Generate reference data (gets re-sampled for each replication)
-ds <- dataset(args$dataset, n = 100)    # Generates a list with `X`, `Y`, `B` and `name`
-# plot(ds$X %*% ds$B, col = 2 * (ds$Y + 1))
-
-################################################################################
-
-metric.subspace <- function(B_true, name = "metric.subspace", normalize = FALSE) {
-    if (!is.matrix(B_true))
-        B_true <- as.matrix(B_true)
-    P_true <- B_true %*% solve(crossprod(B_true), t(B_true))
-    P_true <- tf$constant(P_true, dtype = 'float32')
-
-    if (normalize) {
-        rankSum <- 2 * ncol(B_true)
-        c <- 1 / sqrt(min(rankSum, 2 * nrow(B_true) - rankSum))
-    } else {
-        c <- sqrt(2)
-    }
-    c <- tf$constant(c, dtype = 'float32')
-
-    structure(function(model) {
-        B <- model$get_layer('reduction')$weights
-        function(y_true, y_pred) {
-            P <- tf$linalg$matmul(B, B, transpose_b = TRUE)
-            diff <- P_true - P
-            c * tf$sqrt(tf$reduce_sum(tf$math$multiply(diff, diff)))
-        }
-    },
-        class = c("nnsdr.metric", "Refinement"),
-        name = name
-    )
-}
-
-ds <- dataset(args$dataset)
+## Generate reference data (gets re-sampled for each replication)
+# Generates a list with `X`, `Y`, `B` and `name`
+ds <- dataset(args$dataset, n = 1000)
 
 ## Build Dimension Reduction Neuronal Network model (matching the data)
 nn <- nnsdr$new(
@@ -112,36 +35,62 @@ nn <- nnsdr$new(
     trainable_reduction = args$trainable_reduction,
     output_activation = 'sigmoid',
     loss = 'binary_crossentropy',
-    metrics = list('accuracy', metric.subspace(ds$B, normalize = TRUE))
-    # metrics = list('accuracy')
+    metrics = list(
+        'accuracy',
+        metric.subspace(ds$B, ds$X, ds$Y, type = "OPG", normalize = TRUE),
+        metric.subspace(ds$B, type = "Refinement", normalize = TRUE)
+    )
 )
 
-with(ds, {
-    ## Sample test dataset
-    ds.test <- dataset(ds$name, n = 1000)
+## Open simulation log file, write simulation configuration and header
+log <- file(format(Sys.time(), "results/sim_binary_%Y%m%d_%H%M.csv"), "w", blocking = FALSE)
+cat(paste('#', names(args), args, sep = ' ', collapse = '\n'), '\n',
+    'method,replication,dist.subspace,dist.grassmann,accuracy\n',
+    sep = '', file = log, append = TRUE)
+
+## Set seed for sampling simulation data (NOT affecting the `NN` initialization)
+set.seed(args$seed)
+
+## Repeated simulation runs
+for (rep in seq_len(args$reps)) {
+    ## Re-sample seeded data for each simulation replication
+    with(dataset(ds$name), {
+        ## Sample test dataset
+        ds.test <- dataset(ds$name, n = 1000)
+
+        ## Starting with the reference methods `SIR`, `SAVE` and `PHD`
+        for (method in c("sir", "save", "phdy")) {
+            fit <- dr(Y ~ X, method = method)
+            d.sub <- dist.subspace(B, dr.basis(fit, ncol(B)), normalize = TRUE)
+            d.gra <- dist.grassmann(B, dr.basis(fit, ncol(B)))
+            accuracy <- NA
+            cat('"', method, '",', rep, ',', d.sub, ',', d.gra, ',', accuracy,
+                '\n', sep = '', file = log, append = TRUE)
+        }
+
+        ## Fit `NNSDR` model
+        nn$fit(X, Y, epochs = args$epochs,
+            batch_size = args$batch_size, initializer = args$initializer)
+        # Model evaluation (with metrics)
+        eval <- nn$evaluate(ds.test$X, ds.test$Y)
+        # `OPG` estimate
+        d.sub <- dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE)
+        d.gra <- dist.grassmann(B, coef(nn, 'OPG'))
+        accuracy <- eval[["OPG", "accuracy"]]
+        cat('"nn.opg",', rep, ',', d.sub, ',', d.gra, ',', accuracy,
+            '\n', sep = '', file = log, append = TRUE)
+        # Refinement estimate
+        d.sub <- dist.subspace(B, coef(nn), normalize = TRUE)
+        d.gra <- dist.grassmann(B, coef(nn))
+        accuracy <- eval[["Refinement", "accuracy"]]
+        cat('"nn.ref",', rep, ',', d.sub, ',', d.gra, ',', accuracy,
+            '\n', sep = '', file = log, append = TRUE)
+    })
+
+    ## Reset model
     nn$reset()
-    nn$fit(X, Y, epochs = args$epochs,
-        batch_size = args$batch_size, initializer = args$initializer,
-        verbose = 2L)
+}
 
-    # `OPG` estimate
-    cat("OPG subspace: ", dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE), '\n')
-    cat("OPG grassmann:", dist.grassmann(B, coef(nn, 'OPG')), '\n')
-    # Refinement estimate
-    cat("Ref subspace: ", dist.subspace(B, coef(nn), normalize = TRUE), '\n')
-    cat("Ref grassmann:", dist.grassmann(B, coef(nn)), '\n')
-    # MSE
-    cat("MSE: ", mean((nn$predict(ds.test$X) - ds.test$Y)^2), '\n')
-})
-
-library(ggplot2)
-
-ggplot(nn$history, aes(x = epoch)) +
-    geom_line(aes(y = loss), col = 'red') +
-    geom_line(aes(y = accuracy), col = 'blue')
-
-with(dataset('B2', n = 400), {
-    ggplot(data.frame(XB1 = (X %*% B)[, 1], XB2 = (X %*% B)[, 2], Y = Y)) +
-        geom_point(aes(x = XB1, y = XB2, col = Y))
-})
+## Finished, close simulation log file
+close(log)
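
For the two network rows the logged `accuracy` is keras' binary accuracy, i.e. the rate at which the sigmoid outputs, thresholded at 0.5, match the test responses. The same number can be recomputed from the fitted model directly; a sketch:

    pred <- nn$predict(ds.test$X) > 0.5
    mean(pred == ds.test$Y)
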
diff --git a/simulations/simulations_binary.sh b/simulations/simulations_binary.sh
new file mode 100644
index 0000000..494e6e9
--- /dev/null
+++ b/simulations/simulations_binary.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Catch the termination signal `SIGINT` and invoke `user_interupt`
+trap user_interupt SIGINT
+
+# Report a user interrupt and exit the simulation script (instead of continuing
+# with the next statement; this allows exiting the bash loop with `^C`)
+user_interupt() {
+    echo -e '\nUser Interrupt -> stopping simulation\n'
+    exit
+}
+
+command="Rscript simulations_binary.R --reps=100 --dataset=B1 --hidden_units=1024 --epochs=5,10"
+echo -e "\n$command"
+time eval "$command"
+command="Rscript simulations_binary.R --reps=100 --dataset=B2"
+echo -e "\n$command"
+time eval "$command"
+command="Rscript simulations_binary.R --reps=100 --dataset=B3 --hidden_units=1024,128,32 --epochs=10,200"
+echo -e "\n$command"
+time eval "$command"
+command="Rscript simulations_binary.R --reps=100 --dataset=B4 --hidden_units=1024,128,32 --epochs=10,200"
+echo -e "\n$command"
+time eval "$command"