NNSDR/simulations/simulations_binary.R

#!/usr/bin/env Rscript

library(MAVE)
library(CVarE)
Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings
suppressPackageStartupMessages({
    library(NNSDR)
})

## Parse script parameters
args <- parse.args(defaults = list(
    # Simulation configuration
    reps = 100,             # Number of replications
    dataset = 'B2',         # Name ('B' for Binary) of the data set
    # Neuronal Net. structure/definitions
    hidden_units = 512L,
    activation = 'relu',
    trainable_reduction = TRUE,
    # Neuronal Net. training
    epochs = c(100L, 200L), # Number of training epochs for (`OPG`, Refinement)
    batch_size = 32L,
    initializer = 'fromOPG',
    seed = 956294L
))

################################################################################

dataset <- function(name = "M1", n = NULL, p = 20, sd = 0.5, ...) {
    name <- toupper(name)
    if (nchar(name) == 1) { name <- paste0("M", name) }

    if (name == "B1") {
        if (missing(n)) { n <- 400 }
        if (missing(sd)) { sd <- 1 }
        eps <- sqrt(.Machine$double.eps)

        B <- diag(1, p, 2)
        Z <- matrix(rnorm(2 * n, 0, sd), n, 2)
        Y <- sample(c(FALSE, TRUE), n, replace = TRUE)
        theta <- rnorm(n, Y * pi, 0.25 * pi)
        X <- cbind(
            10 * (cos(theta) + 0.75 * (2 * Y - 1)) + Z[, 1],
            10 * sin(theta) + Z[, 2],
            matrix(rnorm(n * (p - 2), 0, sd), n)
        )
    } else if (name == "B2") {
        if (missing(n)) { n <- 400 }
        if (missing(sd)) { sd <- 0.2 }
        eps <- sqrt(.Machine$double.eps)

        B <- diag(1, p, 2)
        X <- matrix(runif(n * p, -2, 2), n, p)
        XB <- X %*% B
        Y <- (sin(XB[, 1]) / (XB[, 2]^2 + eps) + rnorm(n, 0, sd)) > 0
    # } else if (name == "B2") {
    #     if (missing(n)) { n <- 400 }
    #     if (missing(sd)) { sd <- 0.2 }
    #     eps <- sqrt(.Machine$double.eps)

    #     B <- diag(1, p, 2)
    #     X <- matrix(runif(n * p, -2, 2), n, p)
    #     XB <- X %*% B
    #     Y <- (sin(XB[, 1]) / (XB[, 2]^2 + eps) + rnorm(n, 0, sd)) > 0
    } else {
        stop("Got unknown dataset name.")
    }

    return(list(X = X, Y = as.matrix(Y), B = B, name = name))
}

## Generate reference data (gets re-sampled for each replication)
ds <- dataset(args$dataset, n = 100) # Generates a list with `X`, `Y`, `B` and `name`
# plot(ds$X %*% ds$B, col = 2 * (ds$Y + 1))

################################################################################

metric.subspace <- function(B_true, name = "metric.subspace", normalize = FALSE) {
    if (!is.matrix(B_true))
        B_true <- as.matrix(B_true)
    P_true <- B_true %*% solve(crossprod(B_true), t(B_true))
    P_true <- tf$constant(P_true, dtype = 'float32')

    if (normalize) {
        rankSum <- 2 * ncol(B_true)
        c <- 1 / sqrt(min(rankSum, 2 * nrow(B_true) - rankSum))
    } else {
        c <- sqrt(2)
    }
    c <- tf$constant(c, dtype = 'float32')

    structure(function(model) {
            B <- model$get_layer('reduction')$weights
            function(y_true, y_pred) {
                P <- tf$linalg$matmul(B, B, transpose_b = TRUE)
                diff <- P_true - P
                c * tf$sqrt(tf$reduce_sum(tf$math$multiply(diff, diff)))
            }
        },
        class = c("nnsdr.metric", "Refinement"),
        name = name
    )
}

ds <- dataset(args$dataset)

## Build Dimension Reduction Neuronal Network model (matching the data)
nn <- nnsdr$new(
    input_shapes = list(x = ncol(ds$X)),
    d = ncol(ds$B),
    hidden_units = args$hidden_units,
    activation = args$activation,
    trainable_reduction = args$trainable_reduction,
    output_activation = 'sigmoid',
    loss = 'binary_crossentropy',
    metrics = list('accuracy', metric.subspace(ds$B, normalize = TRUE))
    # metrics = list('accuracy')
)

with(ds, {
    ## Sample test dataset
    ds.test <- dataset(ds$name, n = 1000)

    nn$reset()
    nn$fit(X, Y, epochs = args$epochs,
           batch_size = args$batch_size, initializer = args$initializer,
           verbose = 2L)

    # `OPG` estimate
    cat("OPG subspace: ", dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE), '\n')
    cat("OPG grassmann:", dist.grassmann(B, coef(nn, 'OPG')), '\n')
    # Refinement estimate
    cat("Ref subspace: ", dist.subspace(B, coef(nn), normalize = TRUE), '\n')
    cat("Ref grassmann:", dist.grassmann(B, coef(nn)), '\n')
    # MSE
    cat("MSE: ", mean((nn$predict(ds.test$X) - ds.test$Y)^2), '\n')
})

library(ggplot2)

ggplot(nn$history, aes(x = epoch)) +
    geom_line(aes(y = loss), col = 'red') +
    geom_line(aes(y = accuracy), col = 'blue')

with(dataset('B2', n = 400), {
    ggplot(data.frame(XB1 = (X %*% B)[, 1], XB2 = (X %*% B)[, 2], Y = Y)) +
        geom_point(aes(x = XB1, y = XB2, col = Y))
})
add: binary simulation R script 2021-09-07 11:00:52 +00:00			`#!/usr/bin/env Rscript`

			`library(MAVE)`
			`library(CVarE)`
			Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings
			`suppressPackageStartupMessages({`
			`library(NNSDR)`
			`})`

			`## Parse script parameters`
			`args <- parse.args(defaults = list(`
			`# Simulation configuration`
			`reps = 100, # Number of replications`
			`dataset = 'B2', # Name ('B' for Binary) of the data set`
			`# Neuronal Net. structure/definitions`
			`hidden_units = 512L,`
			`activation = 'relu',`
			`trainable_reduction = TRUE,`
			`# Neuronal Net. training`
			epochs = c(100L, 200L), # Number of training epochs for (`OPG`, Refinement)
			`batch_size = 32L,`
			`initializer = 'fromOPG',`
			`seed = 956294L`
			`))`

			`################################################################################`

			`dataset <- function(name = "M1", n = NULL, p = 20, sd = 0.5, ...) {`
			`name <- toupper(name)`
			`if (nchar(name) == 1) { name <- paste0("M", name) }`

			`if (name == "B1") {`
			`if (missing(n)) { n <- 400 }`
			`if (missing(sd)) { sd <- 1 }`
			`eps <- sqrt(.Machine$double.eps)`

			`B <- diag(1, p, 2)`
			`Z <- matrix(rnorm(2 * n, 0, sd), n, 2)`
			`Y <- sample(c(FALSE, TRUE), n, replace = TRUE)`
			`theta <- rnorm(n, Y * pi, 0.25 * pi)`
			`X <- cbind(`
			`10 * (cos(theta) + 0.75 * (2 * Y - 1)) + Z[, 1],`
			`10 * sin(theta) + Z[, 2],`
			`matrix(rnorm(n * (p - 2), 0, sd), n)`
			`)`
			`} else if (name == "B2") {`
			`if (missing(n)) { n <- 400 }`
			`if (missing(sd)) { sd <- 0.2 }`
			`eps <- sqrt(.Machine$double.eps)`

			`B <- diag(1, p, 2)`
			`X <- matrix(runif(n * p, -2, 2), n, p)`
			`XB <- X %*% B`
			`Y <- (sin(XB[, 1]) / (XB[, 2]^2 + eps) + rnorm(n, 0, sd)) > 0`
			`# } else if (name == "B2") {`
			`# if (missing(n)) { n <- 400 }`
			`# if (missing(sd)) { sd <- 0.2 }`
			`# eps <- sqrt(.Machine$double.eps)`

			`# B <- diag(1, p, 2)`
			`# X <- matrix(runif(n * p, -2, 2), n, p)`
			`# XB <- X %*% B`
			`# Y <- (sin(XB[, 1]) / (XB[, 2]^2 + eps) + rnorm(n, 0, sd)) > 0`
			`} else {`
			`stop("Got unknown dataset name.")`
			`}`

			`return(list(X = X, Y = as.matrix(Y), B = B, name = name))`
			`}`

			`## Generate reference data (gets re-sampled for each replication)`
			ds <- dataset(args$dataset, n = 100) # Generates a list with `X`, `Y`, `B` and `name`
			`# plot(ds$X %% ds$B, col = 2 (ds$Y + 1))`

			`################################################################################`

			`metric.subspace <- function(B_true, name = "metric.subspace", normalize = FALSE) {`
			`if (!is.matrix(B_true))`
			`B_true <- as.matrix(B_true)`
			`P_true <- B_true %*% solve(crossprod(B_true), t(B_true))`
			`P_true <- tf$constant(P_true, dtype = 'float32')`

			`if (normalize) {`
			`rankSum <- 2 * ncol(B_true)`
			`c <- 1 / sqrt(min(rankSum, 2 * nrow(B_true) - rankSum))`
			`} else {`
			`c <- sqrt(2)`
			`}`
			`c <- tf$constant(c, dtype = 'float32')`

			`structure(function(model) {`
			`B <- model$get_layer('reduction')$weights`
			`function(y_true, y_pred) {`
			`P <- tf$linalg$matmul(B, B, transpose_b = TRUE)`
			`diff <- P_true - P`
			`c * tf$sqrt(tf$reduce_sum(tf$math$multiply(diff, diff)))`
			`}`
			`},`
			`class = c("nnsdr.metric", "Refinement"),`
			`name = name`
			`)`
			`}`

			`ds <- dataset(args$dataset)`

			`## Build Dimension Reduction Neuronal Network model (matching the data)`
			`nn <- nnsdr$new(`
			`input_shapes = list(x = ncol(ds$X)),`
			`d = ncol(ds$B),`
			`hidden_units = args$hidden_units,`
			`activation = args$activation,`
			`trainable_reduction = args$trainable_reduction,`
			`output_activation = 'sigmoid',`
			`loss = 'binary_crossentropy',`
			`metrics = list('accuracy', metric.subspace(ds$B, normalize = TRUE))`
			`# metrics = list('accuracy')`
			`)`

			`with(ds, {`
			`## Sample test dataset`
			`ds.test <- dataset(ds$name, n = 1000)`

			`nn$reset()`
			`nn$fit(X, Y, epochs = args$epochs,`
			`batch_size = args$batch_size, initializer = args$initializer,`
			`verbose = 2L)`

			# `OPG` estimate
			`cat("OPG subspace: ", dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE), '\n')`
			`cat("OPG grassmann:", dist.grassmann(B, coef(nn, 'OPG')), '\n')`
			`# Refinement estimate`
			`cat("Ref subspace: ", dist.subspace(B, coef(nn), normalize = TRUE), '\n')`
			`cat("Ref grassmann:", dist.grassmann(B, coef(nn)), '\n')`
			`# MSE`
			`cat("MSE: ", mean((nn$predict(ds.test$X) - ds.test$Y)^2), '\n')`
			`})`

			`library(ggplot2)`

			`ggplot(nn$history, aes(x = epoch)) +`
			`geom_line(aes(y = loss), col = 'red') +`
			`geom_line(aes(y = accuracy), col = 'blue')`

			`with(dataset('B2', n = 400), {`
			`ggplot(data.frame(XB1 = (X %% B)[, 1], XB2 = (X %% B)[, 2], Y = Y)) +`
			`geom_point(aes(x = XB1, y = XB2, col = Y))`
			`})`