From ffd02744a852763de011d7a70ca7b80aeebeaca6 Mon Sep 17 00:00:00 2001
From: daniel <daniel@kapla.at>
Date: Tue, 7 Sep 2021 13:00:52 +0200
Subject: [PATCH] add: binary simulation R script

---
 simulations/simulations_binary.R | 147 +++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 simulations/simulations_binary.R

diff --git a/simulations/simulations_binary.R b/simulations/simulations_binary.R
new file mode 100644
index 0000000..4a27c3f
--- /dev/null
+++ b/simulations/simulations_binary.R
@@ -0,0 +1,147 @@
+#!/usr/bin/env Rscript
+
+library(MAVE)
+library(CVarE)
+Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings
+suppressPackageStartupMessages({
+    library(NNSDR)
+})
+
+## Parse script parameters
+args <- parse.args(defaults = list(
+    # Simulation configuration
+    reps = 100,             # Number of replications
+    dataset = 'B2',         # Name ('B' for Binary) of the data set
+    # Neuronal Net. structure/definitions
+    hidden_units = 512L,
+    activation = 'relu',
+    trainable_reduction = TRUE,
+    # Neuronal Net. training
+    epochs = c(100L, 200L), # Number of training epochs for (`OPG`, Refinement)
+    batch_size = 32L,
+    initializer = 'fromOPG',
+    seed = 956294L
+))
+
+################################################################################
+
+dataset <- function(name = "M1", n = NULL, p = 20, sd = 0.5, ...) {
+    name <- toupper(name)
+    if (nchar(name) == 1) { name <- paste0("M", name) }
+
+    if (name == "B1") {
+        if (missing(n)) { n <- 400 }
+        if (missing(sd)) { sd <- 1 }
+        eps <- sqrt(.Machine$double.eps)
+
+        B <- diag(1, p, 2)
+        Z <- matrix(rnorm(2 * n, 0, sd), n, 2)
+        Y <- sample(c(FALSE, TRUE), n, replace = TRUE)
+        theta <- rnorm(n, Y * pi, 0.25 * pi)
+        X <- cbind(
+            10 * (cos(theta) + 0.75 * (2 * Y - 1)) + Z[, 1],
+            10 * sin(theta) + Z[, 2],
+            matrix(rnorm(n * (p - 2), 0, sd), n)
+        )
+    } else if (name == "B2") {
+        if (missing(n)) { n <- 400 }
+        if (missing(sd)) { sd <- 0.2 }
+        eps <- sqrt(.Machine$double.eps)
+
+        B <- diag(1, p, 2)
+        X <- matrix(runif(n * p, -2, 2), n, p)
+        XB <- X %*% B
+        Y <- (sin(XB[, 1]) / (XB[, 2]^2 + eps) + rnorm(n, 0, sd)) > 0
+    # } else if (name == "B2") {
+    #     if (missing(n)) { n <- 400 }
+    #     if (missing(sd)) { sd <- 0.2 }
+    #     eps <- sqrt(.Machine$double.eps)
+
+    #     B <- diag(1, p, 2)
+    #     X <- matrix(runif(n * p, -2, 2), n, p)
+    #     XB <- X %*% B
+    #     Y <- (sin(XB[, 1]) / (XB[, 2]^2 + eps) + rnorm(n, 0, sd)) > 0
+    } else {
+        stop("Got unknown dataset name.")
+    }
+
+    return(list(X = X, Y = as.matrix(Y), B = B, name = name))
+}
+
+## Generate reference data (gets re-sampled for each replication)
+ds <- dataset(args$dataset, n = 100) # Generates a list with `X`, `Y`, `B` and `name`
+# plot(ds$X %*% ds$B, col = 2 * (ds$Y + 1))
+
+################################################################################
+
+metric.subspace <- function(B_true, name = "metric.subspace", normalize = FALSE) {
+    if (!is.matrix(B_true))
+        B_true <- as.matrix(B_true)
+    P_true <- B_true %*% solve(crossprod(B_true), t(B_true))
+    P_true <- tf$constant(P_true, dtype = 'float32')
+
+    if (normalize) {
+        rankSum <- 2 * ncol(B_true)
+        c <- 1 / sqrt(min(rankSum, 2 * nrow(B_true) - rankSum))
+    } else {
+        c <- sqrt(2)
+    }
+    c <- tf$constant(c, dtype = 'float32')
+
+    structure(function(model) {
+            B <- model$get_layer('reduction')$weights
+            function(y_true, y_pred) {
+                P <- tf$linalg$matmul(B, B, transpose_b = TRUE)
+                diff <- P_true - P
+                c * tf$sqrt(tf$reduce_sum(tf$math$multiply(diff, diff)))
+            }
+        },
+        class = c("nnsdr.metric", "Refinement"),
+        name = name
+    )
+}
+
+ds <- dataset(args$dataset)
+
+## Build Dimension Reduction Neuronal Network model (matching the data)
+nn <- nnsdr$new(
+    input_shapes = list(x = ncol(ds$X)),
+    d = ncol(ds$B),
+    hidden_units = args$hidden_units,
+    activation = args$activation,
+    trainable_reduction = args$trainable_reduction,
+    output_activation = 'sigmoid',
+    loss = 'binary_crossentropy',
+    metrics = list('accuracy', metric.subspace(ds$B, normalize = TRUE))
+    # metrics = list('accuracy')
+)
+
+with(ds, {
+    ## Sample test dataset
+    ds.test <- dataset(ds$name, n = 1000)
+
+    nn$reset()
+    nn$fit(X, Y, epochs = args$epochs,
+           batch_size = args$batch_size, initializer = args$initializer,
+           verbose = 2L)
+
+    # `OPG` estimate
+    cat("OPG subspace: ", dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE), '\n')
+    cat("OPG grassmann:", dist.grassmann(B, coef(nn, 'OPG')), '\n')
+    # Refinement estimate
+    cat("Ref subspace: ", dist.subspace(B, coef(nn), normalize = TRUE), '\n')
+    cat("Ref grassmann:", dist.grassmann(B, coef(nn)), '\n')
+    # MSE
+    cat("MSE: ", mean((nn$predict(ds.test$X) - ds.test$Y)^2), '\n')
+})
+
+library(ggplot2)
+
+ggplot(nn$history, aes(x = epoch)) +
+    geom_line(aes(y = loss), col = 'red') +
+    geom_line(aes(y = accuracy), col = 'blue')
+
+with(dataset('B2', n = 400), {
+    ggplot(data.frame(XB1 = (X %*% B)[, 1], XB2 = (X %*% B)[, 2], Y = Y)) +
+        geom_point(aes(x = XB1, y = XB2, col = Y))
+})