NNSDR/NNSDR/R/datasets.R

#' Multivariate Normal Distribution.
#'
#' Random generation for the multivariate normal distribution.
#' \deqn{X \sim N_p(\mu, \Sigma)}{X ~ N_p(\mu, \Sigma)}
#'
#' @param n number of samples.
#' @param mu mean
#' @param sigma covariance matrix.
#'
#' @return a \eqn{n\times p}{n x p} matrix with samples in its rows.
#'
#' @examples
#' NNSDR:::rmvnorm(20, sigma = matrix(c(2, 1, 1, 2), 2))
#' NNSDR:::rmvnorm(20, mu = c(3, -1, 2))
#'
#' @keywords internal
rmvnorm <- function(n = 1, mu = rep(0, p), sigma = diag(p)) {
    if (!missing(sigma)) {
        p <- nrow(sigma)
    } else if (!missing(mu)) {
        mu <- matrix(mu, ncol = 1)
        p <- nrow(mu)
    } else {
        stop("At least one of 'mu' or 'sigma' must be supplied.")
    }

    return(rep(mu, each = n) + matrix(rnorm(n * p), n) %*% chol(sigma))
}

#' Multivariate t Distribution.
#'
#' Random generation from multivariate t distribution (student distribution).
#'
#' @param n number of samples.
#' @param mu mean
#' @param sigma a \eqn{k\times k}{k x k} positive definite matrix. If the degree
#' \eqn{\nu} if bigger than 2 the created covariance is
#' \deqn{var(x) = \Sigma\frac{\nu}{\nu - 2}}
#' for \eqn{\nu > 2}.
#' @param df degree of freedom \eqn{\nu}.
#'
#' @return a \eqn{n\times p}{n x p} matrix with samples in its rows.
#'
#' @examples
#' NNSDR:::rmvt(20, c(0, 1), matrix(c(3, 1, 1, 2), 2), 3)
#' NNSDR:::rmvt(20, sigma = matrix(c(2, 1, 1, 2), 2), df = 3)
#' NNSDR:::rmvt(20, mu = c(3, -1, 2), df = 3)
#'
#' @keywords internal
rmvt <- function(n = 1, mu = rep(0, p), sigma = diag(p), df = Inf) {
    if (!missing(sigma)) {
        p <- nrow(sigma)
    } else if (!missing(mu)) {
        mu <- matrix(mu, ncol = 1)
        p <- nrow(mu)
    } else {
        stop("At least one of 'mu' or 'sigma' must be supplied.")
    }

    if (df == Inf) {
        Z <- 1
    } else {
        Z <- sqrt(df / rchisq(n, df))
    }

    return(rmvnorm(n, sigma = sigma) * Z + rep(mu, each = n))
}

#' Generalized Normal Distribution.
#'
#' Random generation for generalized Normal Distribution.
#'
#' @param n Number of generated samples.
#' @param mu mean.
#' @param alpha first shape parameter.
#' @param beta second shape parameter.
#'
#' @return numeric array of length \eqn{n}.
#'
#' @keywords internal
rgnorm <- function(n = 1, mu = 0, alpha = 1, beta = 1) {
    if (alpha <= 0 | beta <= 0) {
        stop("alpha and beta must be positive.")
    }
    lambda <- (1 / alpha)^beta
    scales <- qgamma(runif(n), shape = 1 / beta, scale = 1 / lambda)^(1 / beta)
    return(scales * ((-1)^rbinom(n, 1, 0.5)) + mu)
}

#' Laplace distribution
#'
#' Random generation for Laplace distribution.
#'
#' @param n Number of generated samples.
#' @param mu mean.
#' @param sd standard deviation.
#'
#' @return numeric array of length \eqn{n}.
#'
#' @keywords internal
rlaplace <- function(n = 1, mu = 0, sd = 1) {
    U <- runif(n, -0.5, 0.5)
    scale <- sd / sqrt(2)

    return(mu - scale * sign(U) * log(1 - 2 * abs(U)))
}

#' Generates test datasets.
#'
#' Provides sample datasets.
#' The general model is given by:
#' \deqn{Y = g(B'X) + \epsilon}
#'
#' @param name One of \code{"M1"}, ..., \code{"M9"}.
#'      Alternative just the dataset number 1-9.
#' @param n number of samples.
#' @param p Dimension of random variable \eqn{X}.
#' @param sd standard diviation for error term \eqn{\epsilon}.
#' @param ... Additional parameters only for "M2" (namely \code{pmix} and
#' \code{lambda}), see: below.
#'
#' @return List with elements
#' \itemize{
#'      \item{X}{data, a \eqn{n\times p}{n x p} matrix.}
#'      \item{Y}{response.}
#'      \item{B}{the dim-reduction matrix}
#'      \item{name}{Name of the dataset (name parameter)}
#' }
#'
#' @section M1:
#' The predictors are distributed as
#' \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, \Sigma)} with
#' \eqn{\Sigma_{i, j} = 0.5^{|i - j|}}{\Sigma_ij = 0.5^|i - j|} for
#' \eqn{i, j = 1,..., p} for a subspace dimension of \eqn{k = 1} with a default
#' of \eqn{n = 100} data points. \eqn{p = 20},
#' \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)}, and \eqn{Y} is
#' given as \deqn{Y = cos(b_1'X) + \epsilon} where \eqn{\epsilon} is
#' distributed as generalized normal distribution with location 0,
#' shape-parameter 0.5, and the scale-parameter is chosen such that
#' \eqn{Var(\epsilon) = 0.5}.
#' @section M2:
#' The predictors are distributed as \eqn{X \sim Z 1_p \lambda + N_p(0, I_p)}{X ~ Z 1_p \lambda + N_p(0, I_p)}. with
#' \eqn{Z \sim 2 Binom(p_{mix}) - 1\in\{-1, 1\}}{Z~2Binom(pmix)-1} where
#' \eqn{1_p} is the \eqn{p}-dimensional vector of one's, for a subspace
#' dimension of \eqn{k = 1} with a default of \eqn{n = 100} data points.
#' \eqn{p = 20}, \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)},
#' and \eqn{Y} is \deqn{Y = cos(b_1'X) + 0.5\epsilon} where \eqn{\epsilon} is
#' standard normal.
#' Defaults for \code{pmix} is 0.3 and \code{lambda} defaults to 1.
#' @section M3:
#' The predictors are distributed as \eqn{X\sim N_p(0, I_p)}{X~N_p(0, I_p)}
#' for a subspace
#' dimension of \eqn{k = 1} with a default of \eqn{n = 100} data points.
#' \eqn{p = 20}, \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)},
#' and \eqn{Y} is
#' \deqn{Y = 2 log(|b_1'X| + 2) + 0.5\epsilon} where \eqn{\epsilon} is
#' standard normal.
#' @section M4:
#' The predictors are distributed as \eqn{X\sim N_p(0,\Sigma)}{X~N_p(0,\Sigma)}
#' with \eqn{\Sigma_{i, j} = 0.5^{|i - j|}}{\Sigma_ij = 0.5^|i - j|} for
#' \eqn{i, j = 1,..., p} for a subspace dimension of \eqn{k = 2} with a default
#' of \eqn{n = 100} data points. \eqn{p = 20},
#' \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)},
#' \eqn{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / sqrt(6)}
#' and \eqn{Y} is given as \deqn{Y = \frac{b_1'X}{0.5 + (1.5 + b_2'X)^2} + 0.5\epsilon}{Y = (b_1'X) / (0.5 + (1.5 + b_2'X)^2) + 0.5\epsilon}
#' where \eqn{\epsilon} is standard normal.
#' @section M5:
#' The predictors are distributed as \eqn{X\sim U([0,1]^p)}{X~U([0, 1]^p)}
#' where \eqn{U([0, 1]^p)} is the uniform distribution with
#' independent components on the \eqn{p}-dimensional hypercube for a subspace
#' dimension of \eqn{k = 2} with a default of \eqn{n = 200} data points.
#' \eqn{p = 20},
#' \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)},
#' \eqn{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / sqrt(6)}
#' and \eqn{Y} is given as \deqn{Y = cos(\pi b_1'X)(b_2'X + 1)^2 + 0.5\epsilon}
#' where \eqn{\epsilon} is standard normal.
#' @section M6:
#' The predictors are distributed as \eqn{X\sim N_p(0, I_p)}{X~N_p(0, I_p)}
#' for a subspace dimension of \eqn{k = 3} with a default of \eqn{n = 200} data
#' point. \eqn{p = 20, b_1 = e_1, b_2 = e_2}, and \eqn{b_3 = e_p}, where
#' \eqn{e_j} is the \eqn{j}-th unit vector in the \eqn{p}-dimensional space.
#' \eqn{Y} is given as \deqn{Y = (b_1'X)^2+(b_2'X)^2+(b_3'X)^2+0.5\epsilon}
#' where \eqn{\epsilon} is standard normal.
#' @section M7:
#' The predictors are distributed as \eqn{X\sim t_3(I_p)}{X~t_3(I_p)} where
#' \eqn{t_3(I_p)} is the standard multivariate t-distribution with 3 degrees of
#' freedom, for a subspace dimension of \eqn{k = 4} with a default of
#' \eqn{n = 200} data points.
#' \eqn{p = 20, b_1 = e_1, b_2 = e_2, b_3 = e_3}, and \eqn{b_4 = e_p}, where
#' \eqn{e_j} is the \eqn{j}-th unit vector in the \eqn{p}-dimensional space.
#' \eqn{Y} is given as \deqn{Y = (b_1'X)(b_2'X)^2+(b_3'X)(b_4'X)+0.5\epsilon}
#' where \eqn{\epsilon} is distributed as generalized normal distribution with
#' location 0, shape-parameter 1, and the scale-parameter is chosen such that
#' \eqn{Var(\epsilon) = 0.25}.
#'
#' @import stats
#' @importFrom stats rnorm rbinom
#' @export
dataset <- function(name = "M1", n = NULL, p = 20, sd = 0.5, ...) {
    name <- toupper(name)
    if (nchar(name) == 1) { name <- paste0("M", name) }

    if (name == "M1") {
        if (missing(n)) { n <- 100 }
        # B ... `p x 1`
        B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1)
        X <- rmvnorm(n, sigma = 0.5^abs(outer(1:p, 1:p, FUN = `-`)))
        beta <- 0.5
        Y <- cos(X %*% B) + rgnorm(n, 0,
            alpha = sqrt(sd^2 * gamma(1 / beta) / gamma(3 / beta)),
            beta = beta
        )
    } else if (name == "M2") {
        if (missing(n)) { n <- 100 }
        params <- list(...)
        pmix <- if (is.null(params$pmix)) { 0.3 } else { params$pmix }
        lambda <- if (is.null(params$lambda)) { 1 } else { params$lambda }
        # B ... `p x 1`
        B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1)
        Z <- 2 * rbinom(n, 1, pmix) - 1
        X <- matrix(rep(lambda * Z, p) + rnorm(n * p), n)
        Y <- cos(X %*% B) + rnorm(n, 0, sd)
    } else if (name == "M3") {
        if (missing(n)) { n <- 100 }
        # B ... `p x 1`
        B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1)
        X <- matrix(rnorm(n * p), n)
        Y <- 2 * log(2 + abs(X %*% B)) + rnorm(n, 0, sd)
    } else if (name == "M4") {
        if (missing(n)) { n <- 200 }
        # B ... `p x 2`
        B <- cbind(
            c(rep(1 / sqrt(6), 6), rep(0, p - 6)),
            c(rep(c(1, -1), 3) / sqrt(6), rep(0, p - 6))
        )
        X <- rmvnorm(n, sigma = 0.5^abs(outer(1:p, 1:p, FUN = `-`)))
        XB <- X %*% B
        Y <- (XB[, 1]) / (0.5 + (XB[, 2] + 1.5)^2) + rnorm(n, 0, sd)
    } else if (name == "M5") {
        if (missing(n)) { n <- 200 }
        # B ... `p x 2`
        B <- cbind(
            c(rep(1,        6), rep(0, p - 6)),
            c(rep(c(1, -1), 3), rep(0, p - 6))
        ) / sqrt(6)
        X <- matrix(runif(n * p), n)
        XB <- X %*% B
        Y <- cos(XB[, 1] * pi) * (XB[, 2] + 1)^2 + rnorm(n, 0, sd)
    } else if (name == "M6") {
        if (missing(n)) { n <- 200 }
        # B ... `p x 3`
        B <- diag(p)[, -(3:(p - 1))]
        X <- matrix(rnorm(n * p), n)
        Y <- rowSums((X %*% B)^2) + rnorm(n, 0, sd)
    } else if (name == "M7") {
        if (missing(n)) { n <- 400 }
        # B ... `p x 4`
        B <- diag(p)[, -(4:(p - 1))]
        # "R"andom "M"ulti"V"ariate "S"tudent
        X <- rmvt(n = n, sigma = diag(p), df = 3)
        XB <- X %*% B
        Y <- (XB[, 1]) * (XB[, 2])^2 + (XB[, 3]) * (XB[, 4])
        Y <- Y + rlaplace(n, 0, sd)
    } else if (name == "M8") {
        # see: "Local Linear Forests" <arXiv:1807.11408>
        if (missing(n)) { n <- 600 }
        if (missing(p)) { p <- 20 } # 10 and 50 in "Local Linear Forests"
        if (missing(sd)) { sd <- 5 } # or 20

        B <- diag(1, p, 4)
        B[, 4] <- c(0, 0, 0, 2, 1, rep(0, p - 5))

        X <- matrix(runif(n * p), n, p)
        XB <- X %*% B
        Y <- 10 * sin(pi * XB[, 1] * XB[, 2]) + 20 * (XB[, 3] - 0.5)^2 + 5 * XB[, 4] + rnorm(n, sd = sd)
        Y <- as.matrix(Y)
    } else if (name == "M9") {
        if (missing(n)) { n <- 300 }
        X <- matrix(rnorm(n * p), n, p)
        Y <- X[, 1] + (0.5 + X[, 2])^2 * rnorm(n)
        B <- diag(1, p, 2)
    } else {
        stop("Got unknown dataset name.")
    }

    return(list(X = X, Y = as.matrix(Y), B = B, name = name))
}