add: simulation scripts

add: real_data scripts
init: NNSDR implementation (package)
2021-04-20 20:04:59 +02:00 · 2021-04-20 19:42:33 +02:00 · 2021-04-20 19:30:35 +02:00
20 changed files with 1319 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,27 @@
+
+LaTeX/
+
+**/data/*
+**/results/*
+
+.vscode/
+
+# Generated man files in R package (auto generated by Roxygen)
+NNSDR/man/
+
+# Large Files / Data Files
+*.pdf
+*.png
+*.csv
+*.Rdata
+*.zip
+*.tar.gz
+*.tar.xz
+*.BAK
+
+# LaTeX - build/database/... files
+*.log
+*.aux
+*.bbl
+*.blg
+*.out
--- a/NNSDR/DESCRIPTION
+++ b/NNSDR/DESCRIPTION
@ -0,0 +1,13 @@
+Package: NNSDR
+Type: Package
+Title: Fusing Neuronal Networks with Sufficient Dimension Reduction
+Version: 0.1
+Date: 2021-04-19
+Author: Daniel Kapla [aut]
+Maintainer: Daniel Kapla <daniel@kapla.at>
+Description: Compinint the Outer Product of Gradients (OPG) with Neuronal Networks
+    for estimation of the mean subspace.
+License: GPL-3
+Depends: methods, tensorflow (>= 2.2.0)
+Encoding: UTF-8
+RoxygenNote: 7.0.2
--- a/NNSDR/NAMESPACE
+++ b/NNSDR/NAMESPACE
@ -0,0 +1,18 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method(coef,nnsdr)
+S3method(summary,nnsdr)
+export(dataset)
+export(dist.grassmann)
+export(dist.subspace)
+export(get.script)
+export(nnsdr)
+export(parse.args)
+export(reinitialize_weights)
+export(reset_optimizer)
+exportClasses(nnsdr)
+import(methods)
+import(stats)
+import(tensorflow)
+importFrom(stats,rbinom)
+importFrom(stats,rnorm)
--- a/NNSDR/R/NNSDR.R
+++ b/NNSDR/R/NNSDR.R
@ -0,0 +1,8 @@
+#' Mean Subspace estimation with Neural Nets
+#'
+#' Package for simulations using Neural Nets for Mean Subspace estimation.
+#'
+#' @author Daniel Kapla
+#'
+#' @docType package
+"_PACKAGE"
--- a/NNSDR/R/coef_nnsdr.R
+++ b/NNSDR/R/coef_nnsdr.R
@ -0,0 +1,18 @@
+#' Extracts the OPG or refined reduction coefficients from an nnsdr class instance
+#' 
+#' @param object nnsdr class instance
+#' @param type specifies if the OPG or Refinement estimate is requested.
+#'  One of `Refinement` or `OPG`, default is `Refinement`.
+#' @param ... ignored.
+#' 
+#' @return Matrix
+#'
+#' @method coef nnsdr
+#' @export
+coef.nnsdr <- function(object, type, ...) {
+    if (missing(type)) {
+        object$coef()
+    } else {
+        object$coef(type)
+    }
+}
--- a/NNSDR/R/datasets.R
+++ b/NNSDR/R/datasets.R
@ -0,0 +1,263 @@
+#' Multivariate Normal Distribution.
+#'
+#' Random generation for the multivariate normal distribution.
+#' \deqn{X \sim N_p(\mu, \Sigma)}{X ~ N_p(\mu, \Sigma)}
+#'
+#' @param n number of samples.
+#' @param mu mean
+#' @param sigma covariance matrix.
+#'
+#' @return a \eqn{n\times p}{n x p} matrix with samples in its rows.
+#'
+#' @examples
+#' NNSDR:::rmvnorm(20, sigma = matrix(c(2, 1, 1, 2), 2))
+#' NNSDR:::rmvnorm(20, mu = c(3, -1, 2))
+#'
+#' @keywords internal
+rmvnorm <- function(n = 1, mu = rep(0, p), sigma = diag(p)) {
+    if (!missing(sigma)) {
+        p <- nrow(sigma)
+    } else if (!missing(mu)) {
+        mu <- matrix(mu, ncol = 1)
+        p <- nrow(mu)
+    } else {
+        stop("At least one of 'mu' or 'sigma' must be supplied.")
+    }
+
+    return(rep(mu, each = n) + matrix(rnorm(n * p), n) %*% chol(sigma))
+}
+
+#' Multivariate t Distribution.
+#'
+#' Random generation from multivariate t distribution (student distribution).
+#'
+#' @param n number of samples.
+#' @param mu mean
+#' @param sigma a \eqn{k\times k}{k x k} positive definite matrix. If the degree
+#' \eqn{\nu} if bigger than 2 the created covariance is
+#' \deqn{var(x) = \Sigma\frac{\nu}{\nu - 2}}
+#' for \eqn{\nu > 2}.
+#' @param df degree of freedom \eqn{\nu}.
+#'
+#' @return a \eqn{n\times p}{n x p} matrix with samples in its rows.
+#'
+#' @examples
+#' NNSDR:::rmvt(20, c(0, 1), matrix(c(3, 1, 1, 2), 2), 3)
+#' NNSDR:::rmvt(20, sigma = matrix(c(2, 1, 1, 2), 2), df = 3)
+#' NNSDR:::rmvt(20, mu = c(3, -1, 2), df = 3)
+#'
+#' @keywords internal
+rmvt <- function(n = 1, mu = rep(0, p), sigma = diag(p), df = Inf) {
+    if (!missing(sigma)) {
+        p <- nrow(sigma)
+    } else if (!missing(mu)) {
+        mu <- matrix(mu, ncol = 1)
+        p <- nrow(mu)
+    } else {
+        stop("At least one of 'mu' or 'sigma' must be supplied.")
+    }
+
+    if (df == Inf) {
+        Z <- 1
+    } else {
+        Z <- sqrt(df / rchisq(n, df))
+    }
+
+    return(rmvnorm(n, sigma = sigma) * Z + rep(mu, each = n))
+}
+
+#' Generalized Normal Distribution.
+#'
+#' Random generation for generalized Normal Distribution.
+#'
+#' @param n Number of generated samples.
+#' @param mu mean.
+#' @param alpha first shape parameter.
+#' @param beta second shape parameter.
+#'
+#' @return numeric array of length \eqn{n}.
+#'
+#' @keywords internal
+rgnorm <- function(n = 1, mu = 0, alpha = 1, beta = 1) {
+    if (alpha <= 0 | beta <= 0) {
+        stop("alpha and beta must be positive.")
+    }
+    lambda <- (1 / alpha)^beta
+    scales <- qgamma(runif(n), shape = 1 / beta, scale = 1 / lambda)^(1 / beta)
+    return(scales * ((-1)^rbinom(n, 1, 0.5)) + mu)
+}
+
+#' Laplace distribution
+#'
+#' Random generation for Laplace distribution.
+#'
+#' @param n Number of generated samples.
+#' @param mu mean.
+#' @param sd standard deviation.
+#'
+#' @return numeric array of length \eqn{n}.
+#'
+#' @keywords internal
+rlaplace <- function(n = 1, mu = 0, sd = 1) {
+    U <- runif(n, -0.5, 0.5)
+    scale <- sd / sqrt(2)
+
+    return(mu - scale * sign(U) * log(1 - 2 * abs(U)))
+}
+
+#' Generates test datasets.
+#'
+#' Provides sample datasets.
+#' The general model is given by:
+#' \deqn{Y = g(B'X) + \epsilon}
+#'
+#' @param name One of \code{"M1"}, ..., \code{"M9"}.
+#'      Alternative just the dataset number 1-9.
+#' @param n number of samples.
+#' @param p Dimension of random variable \eqn{X}.
+#' @param sd standard diviation for error term \eqn{\epsilon}.
+#' @param ... Additional parameters only for "M2" (namely \code{pmix} and
+#' \code{lambda}), see: below.
+#'
+#' @return List with elements
+#' \itemize{
+#'      \item{X}{data, a \eqn{n\times p}{n x p} matrix.}
+#'      \item{Y}{response.}
+#'      \item{B}{the dim-reduction matrix}
+#'      \item{name}{Name of the dataset (name parameter)}
+#' }
+#'
+#' @section M1:
+#' The predictors are distributed as
+#' \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, \Sigma)} with
+#' \eqn{\Sigma_{i, j} = 0.5^{|i - j|}}{\Sigma_ij = 0.5^|i - j|} for
+#' \eqn{i, j = 1,..., p} for a subspace dimension of \eqn{k = 1} with a default
+#' of \eqn{n = 100} data points. \eqn{p = 20},
+#' \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)}, and \eqn{Y} is
+#' given as \deqn{Y = cos(b_1'X) + \epsilon} where \eqn{\epsilon} is
+#' distributed as generalized normal distribution with location 0,
+#' shape-parameter 0.5, and the scale-parameter is chosen such that
+#' \eqn{Var(\epsilon) = 0.5}.
+#' @section M2:
+#' The predictors are distributed as \eqn{X \sim Z 1_p \lambda + N_p(0, I_p)}{X ~ Z 1_p \lambda + N_p(0, I_p)}. with
+#' \eqn{Z \sim 2 Binom(p_{mix}) - 1\in\{-1, 1\}}{Z~2Binom(pmix)-1} where
+#' \eqn{1_p} is the \eqn{p}-dimensional vector of one's, for a subspace
+#' dimension of \eqn{k = 1} with a default of \eqn{n = 100} data points.
+#' \eqn{p = 20}, \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)},
+#' and \eqn{Y} is \deqn{Y = cos(b_1'X) + 0.5\epsilon} where \eqn{\epsilon} is
+#' standard normal.
+#' Defaults for \code{pmix} is 0.3 and \code{lambda} defaults to 1.
+#' @section M3:
+#' The predictors are distributed as \eqn{X\sim N_p(0, I_p)}{X~N_p(0, I_p)}
+#' for a subspace
+#' dimension of \eqn{k = 1} with a default of \eqn{n = 100} data points.
+#' \eqn{p = 20}, \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)},
+#' and \eqn{Y} is 
+#' \deqn{Y = 2 log(|b_1'X| + 2) + 0.5\epsilon} where \eqn{\epsilon} is
+#' standard normal.
+#' @section M4:
+#' The predictors are distributed as \eqn{X\sim N_p(0,\Sigma)}{X~N_p(0,\Sigma)}
+#' with \eqn{\Sigma_{i, j} = 0.5^{|i - j|}}{\Sigma_ij = 0.5^|i - j|} for
+#' \eqn{i, j = 1,..., p} for a subspace dimension of \eqn{k = 2} with a default
+#' of \eqn{n = 100} data points. \eqn{p = 20},
+#' \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)},
+#' \eqn{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / sqrt(6)}
+#' and \eqn{Y} is given as \deqn{Y = \frac{b_1'X}{0.5 + (1.5 + b_2'X)^2} + 0.5\epsilon}{Y = (b_1'X) / (0.5 + (1.5 + b_2'X)^2) + 0.5\epsilon}
+#' where \eqn{\epsilon} is standard normal.
+#' @section M5:
+#' The predictors are distributed as \eqn{X\sim U([0,1]^p)}{X~U([0, 1]^p)}
+#' where \eqn{U([0, 1]^p)} is the uniform distribution with
+#' independent components on the \eqn{p}-dimensional hypercube for a subspace
+#' dimension of \eqn{k = 2} with a default of \eqn{n = 200} data points.
+#' \eqn{p = 20},
+#' \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)},
+#' \eqn{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / sqrt(6)}
+#' and \eqn{Y} is given as \deqn{Y = cos(\pi b_1'X)(b_2'X + 1)^2 + 0.5\epsilon}
+#' where \eqn{\epsilon} is standard normal.
+#' @section M6:
+#' The predictors are distributed as \eqn{X\sim N_p(0, I_p)}{X~N_p(0, I_p)}
+#' for a subspace dimension of \eqn{k = 3} with a default of \eqn{n = 200} data
+#' point. \eqn{p = 20, b_1 = e_1, b_2 = e_2}, and \eqn{b_3 = e_p}, where
+#' \eqn{e_j} is the \eqn{j}-th unit vector in the \eqn{p}-dimensional space.
+#' \eqn{Y} is given as \deqn{Y = (b_1'X)^2+(b_2'X)^2+(b_3'X)^2+0.5\epsilon}
+#' where \eqn{\epsilon} is standard normal.
+#' @section M7: see "Local Linear Forests" <arXiv:1807.11408>
+#'
+#' @import stats
+#' @importFrom stats rnorm rbinom
+#' @export
+dataset <- function(name = "M1", n = NULL, p = 20, sd = 0.5, ...) {
+    name <- toupper(name)
+    if (nchar(name) == 1) { name <- paste0("M", name) }
+
+    if (name == "M1") {
+        if (missing(n)) { n <- 100 }
+        # B ... `p x 1`
+        B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1)
+        X <- rmvnorm(n, sigma = 0.5^abs(outer(1:p, 1:p, FUN = `-`)))
+        beta <- 0.5
+        Y <- cos(X %*% B) + rgnorm(n, 0,
+            alpha = sqrt(sd^2 * gamma(1 / beta) / gamma(3 / beta)),
+            beta = beta
+        )
+    } else if (name == "M2") {
+        if (missing(n)) { n <- 100 }
+        params <- list(...)
+        pmix <- if (is.null(params$pmix)) { 0.3 } else { params$pmix }
+        lambda <- if (is.null(params$lambda)) { 1 } else { params$lambda }
+        # B ... `p x 1`
+        B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1)
+        Z <- 2 * rbinom(n, 1, pmix) - 1
+        X <- matrix(rep(lambda * Z, p) + rnorm(n * p), n)
+        Y <- cos(X %*% B) + rnorm(n, 0, sd)
+    } else if (name == "M3") {
+        if (missing(n)) { n <- 100 }
+        # B ... `p x 1`
+        B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1)
+        X <- matrix(rnorm(n * p), n)
+        Y <- 2 * log(2 + abs(X %*% B)) + rnorm(n, 0, sd)
+    } else if (name == "M4") {
+        if (missing(n)) { n <- 200 }
+        # B ... `p x 2`
+        B <- cbind(
+            c(rep(1 / sqrt(6), 6), rep(0, p - 6)),
+            c(rep(c(1, -1), 3) / sqrt(6), rep(0, p - 6))
+        )
+        X <- rmvnorm(n, sigma = 0.5^abs(outer(1:p, 1:p, FUN = `-`)))
+        XB <- X %*% B
+        Y <- (XB[, 1]) / (0.5 + (XB[, 2] + 1.5)^2) + rnorm(n, 0, sd)
+    } else if (name == "M5") {
+        if (missing(n)) { n <- 200 }
+        # B ... `p x 2`
+        B <- cbind(
+            c(rep(1,        6), rep(0, p - 6)),
+            c(rep(c(1, -1), 3), rep(0, p - 6))
+        ) / sqrt(6)
+        X <- matrix(runif(n * p), n)
+        XB <- X %*% B
+        Y <- cos(XB[, 1] * pi) * (XB[, 2] + 1)^2 + rnorm(n, 0, sd)
+    } else if (name == "M6") {
+        if (missing(n)) { n <- 200 }
+        # B ... `p x 3`
+        B <- diag(p)[, -(3:(p - 1))]
+        X <- matrix(rnorm(n * p), n)
+        Y <- rowSums((X %*% B)^2) + rnorm(n, 0, sd)
+    } else if (name == "M7") {
+        # see: "Local Linear Forests" <arXiv:1807.11408>
+        if (missing(n)) { n <- 600 }
+        if (missing(p)) { p <- 20 }  # 10 and 50 in "Local Linear Forests"
+        if (missing(sd)) { sd <- 5 } # or 20
+
+        B <- diag(1, p, 4)
+        B[, 4] <- c(0, 0, 0, 2, 1, rep(0, p - 5))
+
+        X <- matrix(runif(n * p), n, p)
+        XB <- X %*% B
+        Y <- 10 * sin(pi * XB[, 1] * XB[, 2]) + 20 * (XB[, 3] - 0.5)^2 + 5 * XB[, 4] + rnorm(n, sd = sd)
+        Y <- as.matrix(Y)
+    } else {
+        stop("Got unknown dataset name.")
+    }
+
+    return(list(X = X, Y = as.matrix(Y), B = B, name = name))
+}
--- a/NNSDR/R/dist_grassmann.R
+++ b/NNSDR/R/dist_grassmann.R
@ -0,0 +1,24 @@
+#' Grassmann distance
+#'
+#' @param A,B Basis matrices as representations of elements of the Grassmann
+#'  manifold.
+#' @param is.ortho Boolean to specify if `A` and `B` are semi-orthogonal (if
+#'  false, both arguments are `qr` decomposed)
+#' @param tol passed to `qr`, ignored if `is.ortho` is `true`.
+#'
+#' @seealso
+#'  K. Ye and L.-H. Lim (2016) "Schubert varieties and distances between
+#'  subspaces of different dimensions" <arXiv:1407.0900>
+#'
+#' @export
+dist.grassmann <- function(A, B, is.ortho = FALSE, tol = 1e-7) {
+    if (!is.ortho) {
+        A <- qr.Q(qr(A, tol))
+        B <- qr.Q(qr(B, tol))
+    } else {
+        A <- as.matrix(A)
+        B <- as.matrix(B)
+    }
+
+    sqrt(sum(acos(pmin(La.svd(crossprod(A, B), 0L, 0L)$d, 1))^2))
+}
--- a/NNSDR/R/dist_subspace.R
+++ b/NNSDR/R/dist_subspace.R
@ -0,0 +1,37 @@
+#' Subspace distance
+#'
+#' @param A,B Basis matrices as representations of elements of the Grassmann
+#'  manifold.
+#' @param is.ortho Boolean to specify if `A` and `B` are semi-orthogonal. If
+#'  false, the projection matrices are computed as
+#' \deqn{P_A = A (A' A)^{-1} A'}
+#'  otherwise just \eqn{P_A = A A'} since \eqn{A' A} is the identity.
+#' @param normalize Boolean to specify if the distance shall be normalized.
+#'  Meaning, the maximal distance scaled to be \eqn{1} independent of dimensions.
+#' 
+#' @seealso
+#'  K. Ye and L.-H. Lim (2016) "Schubert varieties and distances between
+#'  subspaces of different dimensions" <arXiv:1407.0900>
+#' 
+#' @export
+dist.subspace <- function (A, B, is.ortho = FALSE, normalize = FALSE) {
+    if (!is.matrix(A)) A <- as.matrix(A)
+    if (!is.matrix(B)) B <- as.matrix(B)
+
+    if (is.ortho) {
+        PA <- tcrossprod(A, A)
+        PB <- tcrossprod(B, B)
+    } else {
+        PA <- A %*% solve(t(A) %*% A, t(A))
+        PB <- B %*% solve(t(B) %*% B, t(B))
+    }
+
+    if (normalize) {
+        rankSum <- ncol(A) + ncol(B)
+        c <- 1 / sqrt(min(rankSum, 2 * nrow(A) - rankSum))
+    } else {
+        c <- sqrt(2)
+    }
+
+    c * norm(PA - PB, type = "F")
+}
--- a/NNSDR/R/get_script.R
+++ b/NNSDR/R/get_script.R
@ -0,0 +1,12 @@
+
+#' Gets the `Rscript` file name
+#'
+#' @note only relevant in scripts (useless in interactive `R` sessions)
+#'
+#' @return character array of the file name or empty
+#'
+#' @export
+get.script <- function() {
+    args <- commandArgs()
+    sub('--file=', '', args[startsWith(args, '--file=')])
+}
--- a/NNSDR/R/nnsdr_class.R
+++ b/NNSDR/R/nnsdr_class.R
@ -0,0 +1,279 @@
+Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3")
+
+
+
+#' Build MLP
+#'
+#' @param input_shapes TODO:
+#' @param d TODO:
+#' @param name TODO:
+#' @param add_reduction TODO:
+#' @param hidden_units TODO:
+#' @param activation TODO:
+#' @param dropout TODO:
+#' @param loss TODO:
+#' @param optimizer TODO:
+#' @param metrics TODO:
+#' @param trainable_reduction TODO:
+#'
+#' @import tensorflow
+#' @keywords internal
+build.MLP <- function(input_shapes, d, name, add_reduction,
+    output_shape        = 1L,
+    hidden_units        = 512L,
+    activation          = 'relu',
+    dropout             = 0.4,
+    loss                = 'MSE',
+    optimizer           = 'RMSProp',
+    metrics             = NULL,
+    trainable_reduction = TRUE
+) {
+    K <- tf$keras
+
+    inputs <- Map(K$layers$Input,
+        shape = as.integer(input_shapes), # drops names (concatenate key error)
+        name = if (is.null(names(input_shapes))) "" else names(input_shapes)
+    )
+
+    mlp_inputs <- if (add_reduction) {
+        reduction <- K$layers$Dense(
+            units = d,
+            use_bias = FALSE,
+            kernel_constraint = function(w) { # polar projection
+                lhs <- tf$linalg$sqrtm(tf$matmul(w, w, transpose_a = TRUE))
+                tf$transpose(tf$linalg$solve(lhs, tf$transpose(w)))
+            },
+            trainable = trainable_reduction,
+            name = 'reduction'
+        )(inputs[[1]])
+
+        c(reduction, inputs[-1])
+    } else {
+        inputs
+    }
+
+    out <- if (length(inputs) == 1) {
+        mlp_inputs[[1]]
+    } else {
+        K$layers$concatenate(mlp_inputs, axis = 1L, name = 'input_mlp')
+    }
+    for (i in seq_along(hidden_units)) {
+        out <- K$layers$Dense(units = hidden_units[i], activation = activation,
+                              name = paste0('hidden', i))(out)
+        if (dropout > 0)
+            out <- K$layers$Dropout(rate = dropout, name = paste0('dropout', i))(out)
+    }
+    out <- K$layers$Dense(units = output_shape, name = 'output')(out)
+
+    mlp <- K$models$Model(inputs = inputs, outputs = out, name = name)
+    mlp$compile(loss = loss, optimizer = optimizer, metrics = metrics)
+
+    mlp
+}
+
+#' Base Neuronal Network model class
+#'
+#' @examples
+#' model <- nnsdr$new(
+#'     input_shapes = list(x = 7L),
+#'     d = 2L, hidden_units = 128L
+#' )
+#'
+#' @import methods tensorflow
+#' @export nnsdr
+#' @exportClass nnsdr
+nnsdr <- setRefClass('nnsdr',
+    fields = list(
+        config = 'list',
+        nn.opg = 'ANY',
+        nn.ref = 'ANY',
+        history.opg = 'ANY',
+        history.ref = 'ANY',
+        B.opg = 'ANY',
+        B.ref = 'ANY',
+        history = function() {
+            if (is.null(.self$history.opg))
+                return(NULL)
+
+            history <- data.frame(
+                .self$history.opg,
+                model = factor('OPG'),
+                epoch = seq_len(nrow(.self$history.opg))
+            )
+
+            if (!is.null(.self$history.ref))
+                history <- rbind(history, data.frame(
+                    .self$history.ref,
+                    model = factor('Refinement'),
+                    epoch = seq_len(nrow(.self$history.ref))
+                ))
+
+            history
+        }
+    ),
+
+    methods = list(
+        initialize = function(input_shapes, d, output_shape = 1L, ...) {
+            # Set configuration.
+            .self$config <- c(list(
+                input_shapes = input_shapes,
+                d            = as.integer(d),
+                output_shape = output_shape
+            ), list(...))
+
+            # Build OPG (Step 1) and Refinement (Step 2) Neuronal Networks
+            .self$nn.opg <- do.call(build.MLP, c(.self$config, list(
+                name = 'OPG', add_reduction = FALSE
+            )))
+            .self$nn.ref <- do.call(build.MLP, c(.self$config, list(
+                name = 'Refinement', add_reduction = TRUE
+            )))
+
+            # Set initial history field values. If and only if the `history.*`
+            # fields are `NULL`, then the Nets are NOT trained.
+            .self$history.opg <- NULL
+            .self$history.ref <- NULL
+
+            # Set (not jet available) reduction estimates
+            .self$B.opg <- NULL
+            .self$B.ref <- NULL
+        },
+
+        fit = function(inputs, output, epochs = 1L, batch_size = 32L,
+            initializer = c('random', 'fromOPG'), ..., verbose = 0L
+        ) {
+            if (is.list(inputs)) {
+                inputs <- Map(tf$cast, as.list(inputs), dtype = 'float32')
+            } else {
+                inputs <- list(tf$cast(inputs, dtype = 'float32'))
+            }
+            initializer <- match.arg(initializer)
+
+            # Check for OPG history (Step 1), if available skip it.
+            if (is.null(.self$history.opg)) {
+                # Fit OPG Net and store training history.
+                hist <- .self$nn.opg$fit(inputs, output, ...,
+                    epochs = as.integer(head(epochs, 1)),
+                    batch_size = as.integer(head(batch_size, 1)),
+                    verbose = as.integer(verbose)
+                )
+                .self$history.opg <- as.data.frame(hist$history)
+            } else if (verbose > 0) {
+                cat("OPG already trained -> skip OPG training.\n")
+            }
+
+            # Compute OPG estimate of the Reduction matrix 'B'.
+            # Always compute, different inputs change the estimate.
+            with(tf$GradientTape() %as% tape, {
+                tape$watch(inputs[[1]])
+                out <- .self$nn.opg(inputs)
+            })
+            G <- as.matrix(tape$gradient(out, inputs[[1]]))
+            B <- eigen(var(G), symmetric = TRUE)$vectors
+            B <- B[, 1:.self$config$d, drop = FALSE]
+            .self$B.opg <- B
+
+            # Check for need to initialize the Refinement Net.
+            if (is.null(.self$history.ref)) {
+                # Set Reduction layer
+                .self$nn.ref$get_layer('reduction')$set_weights(list(B))
+
+                # Check initialization (for random keep random initialization)
+                if (initializer == 'fromOPG') {
+                    # Initialize Refinement Net weights from the OPG Net.
+                    W <- as.array(.self$nn.opg$get_layer('hidden1')$kernel)
+                    W <- rbind(
+                        t(B) %*% W[1:nrow(B), , drop = FALSE],
+                        W[-(1:nrow(B)), , drop = FALSE]
+                    )
+                    b <- as.array(.self$nn.opg$get_layer('hidden1')$bias)
+                    .self$nn.ref$get_layer('hidden1')$set_weights(list(W, b))
+                    # Get layer names with weights to be initialized from `nn.opg`
+                    # These are the output layer and all hidden layers except the first
+                    layer.names <- Filter(function(name) {
+                        if (name == 'output') {
+                            TRUE
+                        } else if (name == 'hidden1') {
+                            FALSE
+                        } else {
+                            startsWith(name, 'hidden')
+                        }
+                    }, lapply(.self$nn.opg$layers, `[[`, 'name'))
+                    # Copy `nn.opg` weights to `nn.ref`
+                    for (name in layer.names) {
+                        .self$nn.ref$get_layer(name)$set_weights(lapply(
+                            .self$nn.opg$get_layer(name)$weights, as.array
+                        ))
+                    }
+                }
+            } else if (verbose > 0) {
+                cat("Refinement Net already trained -> continue training.\n")
+            }
+
+            # Fit (or continue fitting) the Refinement Net.
+            hist <- .self$nn.ref$fit(inputs, output, ...,
+                epochs = as.integer(tail(epochs, 1)),
+                batch_size = as.integer(tail(batch_size, 1)),
+                verbose = as.integer(verbose)
+            )
+            .self$history.ref <- rbind(
+                .self$history.ref,
+                as.data.frame(hist$history)
+            )
+            # Extract refined reduction estimate
+            .self$B.ref <- .self$nn.ref$get_layer('reduction')$get_weights()[[1]]
+
+            invisible(NULL)
+        },
+        predict = function(inputs) {
+            # Issue warning if the Refinement model (Step 2) used for prediction
+            # is not trained.
+            if (is.null(.self$history.ref))
+                warning('Refinement model not trained.')
+
+            if (is.list(inputs)) {
+                inputs <- Map(tf$cast, as.list(inputs), dtype = 'float32')
+            } else {
+                inputs <- list(tf$cast(inputs, dtype = 'float32'))
+            }
+            output <- .self$nn.ref(inputs)
+
+            if (is.list(output)) {
+                if (length(output) == 1L) {
+                    as.array(output[[1]])
+                } else {
+                    Map(as.array, output)
+                }
+            } else {
+                as.array(output)
+            }
+        },
+        coef = function(type = c('Refinement', 'OPG')) {
+            type <- match.arg(type)
+            if (type == 'Refinement') {
+                .self$B.ref
+            } else {
+                .self$B.opg
+            }
+        },
+        reset = function(reset = c('both', 'Refinement')) {
+            reset <- match.arg(reset)
+            if (reset == 'both') {
+                reinitialize_weights(.self$nn.opg)
+                reset_optimizer(.self$nn.opg$optimizer)
+                .self$history.opg <- NULL
+                .self$B.opg <- NULL
+            }
+            reinitialize_weights(.self$nn.ref)
+            reset_optimizer(.self$nn.ref$optimizer)
+            .self$history.ref <- NULL
+            .self$B.ref <- NULL
+        },
+        summary = function() {
+            .self$nn.opg$summary()
+            cat('\n')
+            .self$nn.ref$summary()
+        }
+    )
+)
+nnsdr$lock('config')
--- a/NNSDR/R/parse_args.R
+++ b/NNSDR/R/parse_args.R
@ -0,0 +1,35 @@
+
+#' Parses script arguments.
+#'
+#' @param defaults list of default parameters. Names of the provided defaults
+#'  define the allowed parameters.
+#' @param args Arguments to parge, if missing the Rscript params are taken.
+#'
+#' @return calling script arguments of `Rscript`
+#'
+#' @export
+parse.args <- function(defaults, args) {
+    if (missing(args))
+        args <- commandArgs(trailingOnly = TRUE)
+    if (length((args)) == 0)
+        return(defaults)
+
+    args <- strsplit(sub('--', '', args, fixed = TRUE), '=')
+    values <- Map(`[`, args, 2)
+    names(values) <- unlist(Map(`[`, args, 1))
+
+    if (!all(names(values) %in% names(defaults)))
+        stop('Found unknown script parameter')
+
+    for (i in seq_along(defaults)) {
+        name <- names(defaults)[i]
+
+        if (name %in% names(values)) {
+            value <- unlist(strsplit(values[[name]], ',', fixed = TRUE))
+            value <- eval(call(paste0('as.', typeof(defaults[[name]])), value))
+            defaults[[name]] <- value
+        }
+    }
+
+    defaults
+}
--- a/NNSDR/R/reinitialize_weights.R
+++ b/NNSDR/R/reinitialize_weights.R
@ -0,0 +1,42 @@
+
+#' Re-initialize model weights.
+#'
+#' An in-place model re-initialization. Intended for simulations to avoid
+#' rebuilding the same model architecture for multiple simulation runs.
+#'
+#' @param model A `keras` model.
+#'
+#' @seealso https://github.com/keras-team/keras/issues/341
+#' @examples
+#' # library(tensorflow) # v2
+#' K <- tf$keras
+#' model <- K$models$Sequential(list(
+#'     K$layers$Dense(units = 7L, input_shape = list(3L)),
+#'     K$layers$Dense(units = 1L)
+#' ))
+#' model$compile(loss = 'MSE', optimizer = K$optimizers$RMSprop())
+#'
+#' model$weights
+#' reinitialize_weights(model)
+#' model$weights
+#'
+#' @export
+reinitialize_weights <- function(model) {
+    for (layer in model$layers) {
+        # Unwrap wrapped layers.
+        if (any(endsWith(class(layer), 'Wrapper')))
+            layer <- layer$layer
+        # Re-initialize kernel and bias weight variables.
+        for (var in layer$weights) {
+            if (any(grep('/recurrent_kernel:', var$name, fixed = TRUE))) {
+                var$assign(layer$recurrent_initializer(var$shape, var$dtype))
+            } else if (any(grep('/kernel:', var$name, fixed = TRUE))) {
+                var$assign(layer$kernel_initializer(var$shape, var$dtype))
+            } else if (any(grep('/bias:', var$name, fixed = TRUE))) {
+                var$assign(layer$bias_initializer(var$shape, var$dtype))
+            } else {
+                stop("Unknown initialization for variable ", var$name)
+            }
+        }
+    }
+}
--- a/NNSDR/R/reset_optimizer.R
+++ b/NNSDR/R/reset_optimizer.R
@ -0,0 +1,39 @@
+
+#' Reset TensorFlow optimizer.
+#'
+#' @param optimizer a \pkg{tensorflow} optimizer instance
+#'
+#' @examples
+#' # Create example toy data
+#'
+#' # library(tensorflow) # v2
+#' K <- tf$keras
+#' model <- K$models$Sequential(list(
+#'     K$layers$Dense(units = 7L, input_shape = list(3L)),
+#'     K$layers$Dense(units = 1L)
+#' ))
+#' model$compile(loss = 'MSE', optimizer = 'RMSprop')
+#'
+#' \donttest{
+#' model$fit(input) # Fit the model
+#' }
+#'
+#' reinitialize_weights(model)
+#' reset_optimizer(model$optimizer)
+#'
+#' \donttest{
+#' model$fit(input) # Fit the model again completely independent of the first fit.
+#' }
+#'
+#' @note Works for Adam, RMSprop properly (other optimizes are not tested!)
+#' @note see source and search for `_create_slots` and `add_slot`.
+#' @seealso https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
+#' @seealso https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/optimizer_v2/rmsprop.py
+#' @seealso https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/optimizer_v2/adam.py
+#'
+#' @export
+reset_optimizer <- function(optimizer) {
+    for (var in optimizer$variables()) {
+        var$assign(tf$zeros_like(var))
+    }
+}
--- a/NNSDR/R/summary_nnsdr.R
+++ b/NNSDR/R/summary_nnsdr.R
@ -0,0 +1,12 @@
+#' Summary for the OPG and Refinement model of an nnsdr class instance
+#' 
+#' @param object nnsdr class instance
+#' @param ... ignored.
+#' 
+#' @return No return value, prints human readable summary.
+#'
+#' @method summary nnsdr
+#' @export
+summary.nnsdr <- function(object, ...) {
+    object$summary()
+}
--- a/real_data/Beijing_Multi_Site_Air_Quality_Data.R
+++ b/real_data/Beijing_Multi_Site_Air_Quality_Data.R
@ -0,0 +1,79 @@
+#!/usr/bin/env Rscript
+## data source: https://archive.ics.uci.edu/ml/datasets/Beijing+Multi-Site+Air-Quality+Data
+
+library(mda)
+Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings
+suppressPackageStartupMessages({
+    library(NNSDR)
+})
+
+## Configuration
+d <- 4L                     # reduction dimension
+epochs = c(2L, 3L)        # training epochs (OPG, Refinement)
+
+## Loading one site of the "Beijing Air Quality" data set
+files <- list.files('data/Beijing\ Multi\ Site\ Air\ Quality\ Data/',
+                    pattern = '*.csv', full.names = TRUE)
+ds <- na.omit(Reduce(rbind, lapply(files, read.csv)))
+
+## Create model matrix with dummy variables for factors (One-Hot encoded) for
+## regression of PM2.5 (and dropping PM10)
+X <- model.matrix(~ year + month + day + hour + SO2 + NO2 + CO + O3 + TEMP +
+                    PRES + DEWP + RAIN + wd + WSPM + station + 0, ds)
+Y <- as.matrix(ds$PM2.5)
+
+## Build Dimension Reduction Neuronal Network model (matching the data)
+nn <- nnsdr$new(
+    input_shapes = list(x = ncol(X)),
+    d = d, # Reduction dimension
+    hidden_units = 512L,
+    activation = 'relu'
+)
+
+## Open simulation log file, write simulation configuration and header
+log <- file(format(Sys.time(), "results/Beijing_Air_Quality.csv"), "w", blocking = FALSE)
+cat('# d = ', d, '\n# epochs = ', epochs[1], ',', epochs[2], '\n',
+    'method,fold,mse,var(Y.test),time.user,time.system,time.elapsed\n',
+    sep = '', file = log, append = TRUE)
+
+## K-Fold Cross Validation
+K <- 10
+for (i in 1:K) {
+    ## Split into train/test sets
+    train <- (1:K) != i
+    X.train <- scale(X[train, ])
+    Y.train <-       Y[train, , drop = FALSE]
+    X.test <- scale(X[!train, ], center = attr(X.train, 'scaled:center'),
+                                  scale = attr(X.train, 'scaled:scale'))
+    Y.test <-       Y[!train, , drop = FALSE]
+
+    ## Training
+    time <- system.time(nn$fit(X.train, Y.train, epochs = epochs, initializer = 'fromOPG'))
+
+    mse <- mean((nn$predict(X.test) - Y.test)^2)
+    cat('"nn.ref",', i, ',', mse, ',', c(var(Y.test)), ',',
+        time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
+        sep = '', file = log, append = TRUE)
+
+    ## Linear Model
+    time <- system.time(lm.mod <- lm(y ~ ., data.frame(X.train, y = Y.train)))
+
+    mse <- mean((predict(lm.mod, data.frame(X.test, y = Y.test)) - Y.test)^2)
+    cat('"lm",', i, ',', mse, ',', c(var(Y.test)), ',',
+        time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
+        sep = '', file = log, append = TRUE)
+
+    ## MARS
+    time <- system.time(mars.mod <- mars(X.train, Y.train))
+
+    mse <- mean((predict(mars.mod, X.test) - Y.test)^2)
+    cat('"mars",', i, ',', mse, ',', c(var(Y.test)), ',',
+        time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
+        sep = '', file = log, append = TRUE)
+
+    ## Reset model
+    nn$reset()
+}
+
+## Finished, close simulation log file
+close(log)
--- a/real_data/BostonHousing.R
+++ b/real_data/BostonHousing.R
@ -0,0 +1,70 @@
+#!/usr/bin/env Rscript
+
+library(MAVE)
+library(CVarE)
+library(mlbench)
+Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings
+suppressPackageStartupMessages({
+    library(NNSDR)
+})
+
+## Configuration
+d <- 1L
+
+## Load Boston Housing data set
+data('BostonHousing')
+ds <- BostonHousing[, names(BostonHousing) != 'chas']
+
+## Build Dimension Reduction Neuronal Network model (matching the data)
+nn <- nnsdr$new(
+    input_shapes = list(x = ncol(ds) - 1L),
+    d = d, # Reduction dimension
+    hidden_units = 512L,
+    activation = 'relu'
+)
+
+## Open simulation log file, write simulation configuration and header
+log <- file(format(Sys.time(), "results/BostonHousing.csv"), "w", blocking = FALSE)
+cat('# d = ', d, '\n', 'method,fold,mse\n', sep = '', file = log, append = TRUE)
+
+## K-Fold Cross Validation
+K <- nrow(ds)   # with K == nrow(ds) -> LOO CV
+for (i in 1:K) {
+    ds.train <- ds[(1:K) != i, ]
+    ds.test <- ds[(1:K) == i, , drop = FALSE]
+    X.train <- as.matrix(ds.train[, names(ds) != 'medv'])
+    Y.train <- as.matrix(ds.train[, 'medv'])
+    X.test <- as.matrix(ds.test[, names(ds) != 'medv'])
+    Y.test <- as.matrix(ds.test[, 'medv'])
+
+    ## Fit `DR` Neuronal Network model
+    nn$fit(X.train, Y.train, epochs = c(200L, 400L), initializer = 'fromOPG')
+    Y.pred <- nn$predict(X.test)
+    mse <- mean((Y.pred - Y.test)^2)
+    cat('"nn.ref",', i, ',', mse, '\n',
+        sep = '', file = log, append = TRUE)
+
+    ## MAVE as reference
+    dr <- mave.compute(X.train, Y.train, method = 'meanMAVE', max.dim = d)
+    Y.pred <- predict(dr, X.test, d)
+    mse <- mean((Y.pred - Y.test)^2)
+    cat('"mave",', i, ',', mse, '\n',
+        sep = '', file = log, append = TRUE)
+
+    ## and CVE
+    X.scaled <- scale(X.train)
+    dr <- cve.call(X.scaled, Y.train, k = d)
+    Y.pred <- predict(dr, scale(X.test,
+                                scale = attr(X.scaled, 'scaled:scale'),
+                                center = attr(X.scaled, 'scaled:center')),
+                      k = d)
+    mse <- mean((Y.pred - Y.test)^2)
+    cat('"cve",', i, ',', mse, '\n',
+        sep = '', file = log, append = TRUE)
+
+    ## Reset model
+    nn$reset()
+}
+
+## Finished, close simulation log file
+close(log)
--- a/real_data/kc_house_data.R
+++ b/real_data/kc_house_data.R
@ -0,0 +1,86 @@
+#!/usr/bin/env Rscript
+## data source: the `MAVE` R-package
+
+library(MAVE)
+library(CVarE)
+Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings
+suppressPackageStartupMessages({
+    library(NNSDR)
+})
+
+## Configuration
+d <- 1L                     # reduction dimension
+epochs = c(50L, 100L)       # training epochs
+dropped <- c('id', 'date', 'zipcode') #, 'sqft_basement')  # columns to be dropped
+
+## Loading the "House Price in King Counte, USA" data set provided by MAVE
+data('kc_house_data')
+ds <- kc_house_data[, !(names(kc_house_data) %in% dropped)]
+
+## Build Dimension Reduction Neuronal Network model (matching the data)
+nn <- nnsdr$new(
+    input_shapes = list(x = ncol(ds) - 1L),
+    d = d, # Reduction dimension
+    hidden_units = 512L,
+    activation = 'relu'
+)
+
+## Open simulation log file, write simulation configuration and header
+log <- file(format(Sys.time(), "results/kc_house_data.csv"), "w", blocking = FALSE)
+cat('# d = ', d, '\n# epochs = ', epochs[1], ',', epochs[2], '\n',
+    '# dropped = ', paste(dropped, collapse = ', '), '\n',
+    'method,fold,mse,var(Y.test),time.user,time.system,time.elapsed\n',
+    sep = '', file = log, append = TRUE)
+
+## K-Fold Cross Validation
+K <- 10
+for (i in 1:K) {
+    ds.train <- ds[(1:K) != i, ]
+    ds.test <- ds[(1:K) == i, , drop = FALSE]
+    X.train <- as.matrix(ds.train[, names(ds) != 'price'])
+    Y.train <- as.matrix(ds.train[, 'price'])
+    X.test <- as.matrix(ds.test[, names(ds) != 'price'])
+    Y.test <- as.matrix(ds.test[, 'price'])
+
+    ## Fit `DR` Neuronal Network model
+    time <- system.time(nn$fit(X.train, Y.train, epochs = epochs, initializer = 'fromOPG'))
+
+    mse <- mean((nn$predict(X.test) - Y.test)^2)
+    cat('"nn.ref",', i, ',', mse, ',', c(var(Y.test)), ',',
+        time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
+        sep = '', file = log, append = TRUE)
+
+    ## `MAVE`
+    time <- system.time(dr <- mave.compute(X.train, Y.train, method = 'meanMAVE', max.dim = d))
+
+    # Sometimes the `mda` package fails -> predict with NA/NaN/Inf value error.
+    mse <- tryCatch(mean((predict(dr, X.test, d) - Y.test)^2),
+                    error = function(err) NA)
+    cat('"mave",', i, ',', mse, ',', c(var(Y.test)), ',',
+        time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
+        sep = '', file = log, append = TRUE)
+
+    # Current implementation requires too much memory (CVarE v1.1). Run on `VSC`.
+    # ## and CVE
+    # X.scaled <- scale(X.train)
+    # time <- system.time(dr <- cve.call(X.scaled, Y.train, k = d))
+
+    # # Might have the same problem as MAVE since we use `mda` as well.
+    # mse <- tryCatch({
+    #         Y.pred <- predict(dr, scale(X.test,
+    #                                     scale = attr(X.scaled, 'scaled:scale'),
+    #                                     center = attr(X.scaled, 'scaled:center')),
+    #                           k = d)
+    #         mean((Y.pred - Y.test)^2)
+    #     },
+    #     error = function(err) NA)
+    # cat('"cve",', i, ',', mse, ',', c(var(Y.test)), ',',
+    #     time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
+    #     sep = '', file = log, append = TRUE)
+
+    ## Reset model
+    nn$reset()
+}
+
+## Finished, close simulation log file
+close(log)
--- a/simulations/colin_test.R
+++ b/simulations/colin_test.R
@ -0,0 +1,48 @@
+library(MAVE)
+library(CVarE)
+library(NNSDR)
+
+set.seed(797)
+
+dataset <- function(n = 100, p = 10, sd = 0.5) {
+    X <- matrix(rnorm(n * (p - 1)), n, p - 1)
+    X <- cbind(-0.5 * (X[, 1] + X[, 2]) + 0.001 * rnorm(n), X)
+    B <- diag(p)[, 4, drop = FALSE]
+    Y <- as.matrix(X[, 4]^2 + rnorm(n, 0, sd))
+    return(list(X = X, Y = Y, B = B))
+}
+
+nn <- nnsdr$new(
+    input_shapes = list(x = 10L),
+    d = 1L,
+    hidden_units = 512L,
+    activation = 'relu',
+    trainable_reduction = TRUE
+)
+
+sim <- data.frame(mave = rep(NA, 10), cve = NA, nn.opg = NA, nn.ref = NA)
+for (i in 1:10) {
+    cat(i, "/ 10\n")
+    with(dataset(), {
+        dr <- mave.compute(X, Y, method = 'meanMAVE', max.dim = 1)
+        sim[i, 'mave'] <<- dist.subspace(B, coef(dr, 1), normalize = TRUE)
+
+        dr <- cve.call(X, Y, k = 1)
+        sim[i, 'cve'] <<- dist.subspace(B, coef(dr, 1), normalize = TRUE)
+
+        nn$fit(X, Y, epochs = c(200L, 400L), batch_size = 32L, initializer = 'fromOPG')
+        sim[i, 'nn.opg'] <<- dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE)
+        sim[i, 'nn.ref'] <<- dist.subspace(B, coef(nn), normalize = TRUE)
+    })
+    nn$reset()
+}
+round(t(data.frame(
+    mean   = colMeans(sim),
+    median = apply(sim, 2, median),
+    sd     = apply(sim, 2, sd)
+)), 3)
+
+#        &$\mave$& $\cve$&$\nn_{512}$ \\
+# mean   & 0.917 & 0.164 & 0.101 \\
+# median & 0.999 & 0.162 & 0.096 \\
+# sd     & 0.256 & 0.057 & 0.032 \\
--- a/simulations/simulations.R
+++ b/simulations/simulations.R
@ -0,0 +1,98 @@
+#!/usr/bin/env Rscript
+
+library(MAVE)
+library(CVarE)
+Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings
+suppressPackageStartupMessages({
+    library(NNSDR)
+})
+
+## Parse script parameters
+args <- parse.args(defaults = list(
+    # Simulation configuration
+    reps = 100,             # Number of replications
+    dataset = '1',          # Name (number) of the data set
+    # Neuronal Net. structure/definitions
+    hidden_units = 512L,
+    activation = 'relu',    # or `relu`
+    trainable_reduction = TRUE,
+    # Neuronal Net. training
+    epochs = c(200L, 400L), # Number of training epochs for (`OPG`, Refinement)
+    batch_size = 32L,
+    initializer = 'fromOPG',
+    seed = 1390L
+))
+
+## Generate reference data (gets re-sampled for each replication)
+ds <- dataset(args$dataset) # Generates a list with `X`, `Y`, `B` and `name`
+
+## Build Dimension Reduction Neuronal Network model (matching the data)
+nn <- nnsdr$new(
+    input_shapes = list(x = ncol(ds$X)),
+    d = ncol(ds$B),
+    hidden_units = args$hidden_units,
+    activation = args$activation,
+    trainable_reduction = args$trainable_reduction
+)
+
+## Open simulation log file, write simulation configuration and header
+log <- file(format(Sys.time(), "results/sim_%Y%m%d_%H%M.csv"), "w", blocking = FALSE)
+cat(paste('#', names(args), args, sep = ' ', collapse = '\n'), '\n',
+    'method,replication,dist.subspace,dist.grassmann,mse\n',
+    sep = '', file = log, append = TRUE)
+
+## Set seed for sampling simulation data (NOT effecting the `NN` initialization)
+set.seed(args$seed)
+
+## Repeated simulation runs
+for (rep in seq_len(args$reps)) {
+    ## Re-sample seeded data for each simulation replication
+    with(dataset(ds$name), {
+        ## Sample test dataset
+        ds.test <- dataset(ds$name, n = 1000)
+
+        ## First the reference method `MAVE`
+        dr <- mave(Y ~ X, method = "meanMAVE")
+        d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
+        d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
+        mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
+        cat('"mave",', rep, ',', d.sub, ',', d.gra, ',', mse, '\n', sep = '',
+            file = log, append = TRUE)
+        ## and the `OPG` method
+        dr <- mave(Y ~ X, method = "meanOPG")
+        d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
+        d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
+        mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
+        cat('"opg",', rep, ',', d.sub, ',', d.gra, ',', mse, '\n', sep = '',
+            file = log, append = TRUE)
+
+        ## Next the `CVE` method
+        dr <- cve(Y ~ X, k = ncol(B))
+        d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
+        d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
+        mse <- mean((predict(dr, ds.test$X, k = ncol(B)) - ds.test$Y)^2)
+        cat('"cve",', rep, ',', d.sub, ',', d.gra, ',', mse, '\n', sep = '',
+            file = log, append = TRUE)
+
+        ## Fit `NNSDR` model
+        nn$fit(X, Y, epochs = args$epochs,
+            batch_size = args$batch_size, initializer = args$initializer)
+        # `OPG` estimate
+        d.sub <- dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE)
+        d.gra <- dist.grassmann(B, coef(nn, 'OPG'))
+        cat('"nn.opg",', rep, ',', d.sub, ',', d.gra, ',', NA, '\n', sep = '',
+            file = log, append = TRUE)
+        # Refinement estimate
+        d.sub <- dist.subspace(B, coef(nn), normalize = TRUE)
+        d.gra <- dist.grassmann(B, coef(nn))
+        mse <- mean((nn$predict(ds.test$X) - ds.test$Y)^2)
+        cat('"nn.ref",', rep, ',', d.sub, ',', d.gra, ',', mse, '\n', sep = '',
+            file = log, append = TRUE)
+    })
+
+    ## Reset model
+    nn$reset()
+}
+
+## Finished, close simulation log file
+close(log)
--- a/simulations/simulations_bigdata.R
+++ b/simulations/simulations_bigdata.R
@ -0,0 +1,111 @@
+#!/usr/bin/env Rscript
+
+library(MAVE)
+library(CVarE)
+Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings
+suppressPackageStartupMessages({
+    library(NNSDR)
+})
+
+## Parse script parameters
+args <- parse.args(defaults = list(
+    # Simulation configuration
+    reps = 10L,             # Number of replications
+    dataset = '6',          # Name (number) of the data set
+    # Neuronal Net. structure/definitions
+    hidden_units = 512L,
+    activation = 'relu',
+    trainable_reduction = TRUE,
+    # Neuronal Net. training
+    epochs = c(200L, 400L), # Number of training epochs for (`OPG`, Refinement)
+    batch_size = 32L,
+    initializer = 'fromOPG',
+    # Simulation data generation configuration
+    seed = 1390L,
+    n = 100L,
+    p = 20L
+))
+
+## Generate reference data (gets re-sampled for each replication)
+# Number of observations are irrelevant for the reference to generate a matching
+# `NNSDR` estimator
+ds <- dataset(args$dataset, n = 100L, p = args$p) # Generates a list with `X`, `Y`, `B` and `name`
+
+## Build Dimension Reduction Neuronal Network model (matching the data)
+nn <- nnsdr$new(
+    input_shapes = list(x = ncol(ds$X)),
+    d = ncol(ds$B),
+    hidden_units = args$hidden_units,
+    activation = args$activation,
+    trainable_reduction = args$trainable_reduction
+)
+
+## Open simulation log file, write simulation configuration and header
+log <- file(format(Sys.time(), "results/sim_big_%Y%m%d_%H%M.csv"), "w", blocking = FALSE)
+cat(paste('#', names(args), args, sep = ' ', collapse = '\n'), '\n',
+    'method,replication,dist.subspace,dist.grassmann,mse,time.user,time.system,time.elapsed\n',
+    sep = '', file = log, append = TRUE)
+
+## Repeated simulation runs
+for (rep in seq_len(args$reps)) {
+    ## Re-sample seeded data for each simulation replication
+    with(dataset(ds$name, n = args$n, p = args$p), {
+        ## Sample test dataset
+        ds.test <- dataset(ds$name, n = 1000L, p = args$p)
+
+        ## First the reference method `MAVE`
+        # To be fair for measuring the time, set `max.dim` to true reduction dimension
+        # and with `screen = ncol(X)` screening is turned "off".
+        time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B),
+            method = "meanMAVE", screen = ncol(X)))
+        d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
+        d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
+        mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
+        cat('"mave",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
+            time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
+            sep = '', file = log, append = TRUE)
+        ## and the `OPG` method
+        time <- system.time(dr <- mave.compute(X, Y, max.dim = ncol(B),
+            method = "meanOPG", screen = ncol(X)))
+        d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
+        d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
+        mse <- mean((predict(dr, ds.test$X, dim = ncol(B)) - ds.test$Y)^2)
+        cat('"opg",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
+            time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
+            sep = '', file = log, append = TRUE)
+
+        ## Next the CVE method
+        time <- system.time(dr <- cve.call(X, Y, k = ncol(B)))
+        d.sub <- dist.subspace(B, coef(dr, ncol(B)), normalize = TRUE)
+        d.gra <- dist.grassmann(B, coef(dr, ncol(B)))
+        mse <- mean((predict(dr, ds.test$X, k = ncol(B)) - ds.test$Y)^2)
+        cat('"cve",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
+            time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
+            sep = '', file = log, append = TRUE)
+
+        ## Fit `DR` Neuronal Network model
+        time <- system.time(nn$fit(X, Y, epochs = args$epochs,
+            batch_size = args$batch_size, initializer = args$initializer))
+        # OPG estimate
+        d.sub <- dist.subspace(B, coef(nn, 'OPG'), normalize = TRUE)
+        d.gra <- dist.grassmann(B, coef(nn, 'OPG'))
+        cat('"nn.opg",', rep, ',', d.sub, ',', d.gra, ',NA,NA,NA,NA\n',
+            sep = '', file = log, append = TRUE)
+        # Refinement estimate
+        d.sub <- dist.subspace(B, coef(nn), normalize = TRUE)
+        d.gra <- dist.grassmann(B, coef(nn))
+        mse <- mean((nn$predict(ds.test$X) - ds.test$Y)^2)
+        cat('"nn.ref",', rep, ',', d.sub, ',', d.gra, ',', mse, ',',
+            time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
+            sep = '', file = log, append = TRUE)
+    })
+
+    ## Invoke the garbage collector
+    gc()
+
+    ## Reset model
+    nn$reset()
+}
+
+## Finished, close simulation log file
+close(log)
Author	SHA1	Message	Date
Daniel Kapla	0214823794	add: simulation scripts	2021-04-20 20:04:59 +02:00
Daniel Kapla	2c5a87ccfe	add: real_data scripts	2021-04-20 19:42:33 +02:00
Daniel Kapla	6e4f3d30a7	init: NNSDR implementation (package)	2021-04-20 19:30:35 +02:00