From ffd02744a852763de011d7a70ca7b80aeebeaca6 Mon Sep 17 00:00:00 2001 From: daniel Date: Tue, 7 Sep 2021 13:00:52 +0200 Subject: [PATCH] add: binary simulation R script --- simulations/simulations_binary.R | 147 +++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 simulations/simulations_binary.R diff --git a/simulations/simulations_binary.R b/simulations/simulations_binary.R new file mode 100644 index 0000000..4a27c3f --- /dev/null +++ b/simulations/simulations_binary.R @@ -0,0 +1,147 @@ +#!/usr/bin/env Rscript + +library(MAVE) +library(CVarE) +Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings +suppressPackageStartupMessages({ + library(NNSDR) +}) + +## Parse script parameters +args <- parse.args(defaults = list( + # Simulation configuration + reps = 100, # Number of replications + dataset = 'B2', # Name ('B' for Binary) of the data set + # Neuronal Net. structure/definitions + hidden_units = 512L, + activation = 'relu', + trainable_reduction = TRUE, + # Neuronal Net. training + epochs = c(100L, 200L), # Number of training epochs for (`OPG`, Refinement) + batch_size = 32L, + initializer = 'fromOPG', + seed = 956294L +)) + +################################################################################ + +dataset <- function(name = "M1", n = NULL, p = 20, sd = 0.5, ...) { + name <- toupper(name) + if (nchar(name) == 1) { name <- paste0("M", name) } + + if (name == "B1") { + if (missing(n)) { n <- 400 } + if (missing(sd)) { sd <- 1 } + eps <- sqrt(.Machine$double.eps) + + B <- diag(1, p, 2) + Z <- matrix(rnorm(2 * n, 0, sd), n, 2) + Y <- sample(c(FALSE, TRUE), n, replace = TRUE) + theta <- rnorm(n, Y * pi, 0.25 * pi) + X <- cbind( + 10 * (cos(theta) + 0.75 * (2 * Y - 1)) + Z[, 1], + 10 * sin(theta) + Z[, 2], + matrix(rnorm(n * (p - 2), 0, sd), n) + ) + } else if (name == "B2") { + if (missing(n)) { n <- 400 } + if (missing(sd)) { sd <- 0.2 } + eps <- sqrt(.Machine$double.eps) + + B <- diag(1, p, 2) + X <- matrix(runif(n * p, -2, 2), n, p) + XB <- X %*% B + Y <- (sin(XB[, 1]) / (XB[, 2]^2 + eps) + rnorm(n, 0, sd)) > 0 + # } else if (name == "B2") { + # if (missing(n)) { n <- 400 } + # if (missing(sd)) { sd <- 0.2 } + # eps <- sqrt(.Machine$double.eps) + + # B <- diag(1, p, 2) + # X <- matrix(runif(n * p, -2, 2), n, p) + # XB <- X %*% B + # Y <- (sin(XB[, 1]) / (XB[, 2]^2 + eps) + rnorm(n, 0, sd)) > 0 + } else { + stop("Got unknown dataset name.") + } + + return(list(X = X, Y = as.matrix(Y), B = B, name = name)) +} + +## Generate reference data (gets re-sampled for each replication) +ds <- dataset(args$dataset, n = 100) # Generates a list with `X`, `Y`, `B` and `name` +# plot(ds$X %*% ds$B, col = 2 * (ds$Y + 1)) + +################################################################################ + +metric.subspace <- function(B_true, name = "metric.subspace", normalize = FALSE) { + if (!is.matrix(B_true)) + B_true <- as.matrix(B_true) + P_true <- B_true %*% solve(crossprod(B_true), t(B_true)) + P_true <- tf$constant(P_true, dtype = 'float32') + + if (normalize) { + rankSum <- 2 * ncol(B_true) + c <- 1 / sqrt(min(rankSum, 2 * nrow(B_true) - rankSum)) + } else { + c <- sqrt(2) + } + c <- tf$constant(c, dtype = 'float32') + + structure(function(model) { + B <- model$get_layer('reduction')$weights + function(y_true, y_pred) { + P <- tf$linalg$matmul(B, B, transpose_b = TRUE) + diff <- P_true - P + c * tf$sqrt(tf$reduce_sum(tf$math$multiply(diff, diff))) + } + }, + class = c("nnsdr.metric", "Refinement"), + name = name + ) +} + +ds <- dataset(args$dataset) + +## Build Dimension Reduction Neuronal Network model (matching the data) +nn <- nnsdr$new( + input_shapes = list(x = ncol(ds$X)), + d = ncol(ds$B), + hidden_units = args$hidden_units, + activation = args$activation, + trainable_reduction = args$trainable_reduction, + output_activation = 'sigmoid', + loss = 'binary_crossentropy', + metrics = list('accuracy', metric.subspace(ds$B, normalize = TRUE)) + # metrics = list('accuracy') +) + +with(ds, { + ## Sample test dataset + ds.test <- dataset(ds$name, n = 1000) + + nn$reset() + nn$fit(X, Y, epochs = args$epochs, + batch_size = args$batch_size, initializer = args$initializer, + verbose = 2L) + + # `OPG` estimate + cat("OPG subspace: ", dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE), '\n') + cat("OPG grassmann:", dist.grassmann(B, coef(nn, 'OPG')), '\n') + # Refinement estimate + cat("Ref subspace: ", dist.subspace(B, coef(nn), normalize = TRUE), '\n') + cat("Ref grassmann:", dist.grassmann(B, coef(nn)), '\n') + # MSE + cat("MSE: ", mean((nn$predict(ds.test$X) - ds.test$Y)^2), '\n') +}) + +library(ggplot2) + +ggplot(nn$history, aes(x = epoch)) + + geom_line(aes(y = loss), col = 'red') + + geom_line(aes(y = accuracy), col = 'blue') + +with(dataset('B2', n = 400), { + ggplot(data.frame(XB1 = (X %*% B)[, 1], XB2 = (X %*% B)[, 2], Y = Y)) + + geom_point(aes(x = XB1, y = XB2, col = Y)) +})