CVE/CVE_C/R/datasets.R

#'
#' @param n number of samples.
#' @param mu mean
#' @param sigma covariance matrix.
#'
#' @returns a \eqn{n\times p} matrix with samples in its rows.
#'
#' @examples
#' rmvnorm(20, sigma = matrix(c(2, 1, 1, 2), 2))
#' rmvnorm(20, mu = c(3, -1, 2))
rmvnorm <- function(n = 1, mu = rep(0, p), sigma = diag(p)) {
    if (!missing(sigma)) {
        p <- nrow(sigma)
    } else if (!missing(mu)) {
        mu <- matrix(mu, ncol = 1)
        p <- nrow(mu)
    } else {
        stop("At least one of 'mu' or 'sigma' must be supplied.")
    }

    # See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution
    return(rep(mu, each = n) + matrix(rnorm(n * p), n) %*% chol(sigma))
}

#' Samples from the multivariate t distribution (student distribution).
#'
#' @param n number of samples.
#' @param mu mean, ... TODO:
#' @param sigma a \eqn{k\times k} positive definite matrix. If the degree
#' \eqn{\nu} if bigger than 2 the created covariance is
#' \deqn{var(x) = \Sigma\frac{\nu}{\nu - 2}}
#' for \eqn{\nu > 2}.
#' @param df degree of freedom \eqn{\nu}.
#'
#' @returns a \eqn{n\times p} matrix with samples in its rows.
#'
#' @examples
#' rmvt(20, c(0, 1), matrix(c(3, 1, 1, 2), 2), 3)
#' rmvt(20, sigma = matrix(c(2, 1, 1, 2), 2), 3)
#' rmvt(20, mu = c(3, -1, 2), 3)
rmvt <- function(n = 1, mu = rep(0, p), sigma = diag(p), df = Inf) {
    if (!missing(sigma)) {
        p <- nrow(sigma)
    } else if (!missing(mu)) {
        mu <- matrix(mu, ncol = 1)
        p <- nrow(mu)
    } else {
        stop("At least one of 'mu' or 'sigma' must be supplied.")
    }

    if (df == Inf) {
        Z <- 1
    } else {
        Z <- sqrt(df / rchisq(n, df))
    }

    return(rmvnorm(n, sigma = sigma) * Z + rep(mu, each = n))
}


#' Generalized Normal Distribution.
#' see: https://en.wikipedia.org/wiki/Generalized_normal_distribution
rgnorm <- function(n = 1, mu = 0, alpha = 1, beta = 1) {
    if (alpha <= 0 | beta <= 0) {
        stop("alpha and beta must be positive.")
    }
    lambda <- (1 / alpha)^beta
    scales <- qgamma(runif(n), shape = 1 / beta, scale = 1 / lambda)^(1 / beta)
    return(scales * ((-1)^rbinom(n, 1, 0.5)) + mu)
}

#' Laplace distribution
#' see: https://en.wikipedia.org/wiki/Laplace_distribution
rlaplace <- function(n = 1, mu = 0, sigma = 1) {
    U <- runif(n, -0.5, 0.5)
    scale <- sigma / sqrt(2)

    return(mu - scale * sign(U) * log(1 - 2 * abs(U)))
}

#' Generates test datasets.
#'
#' Provides sample datasets. There are 5 different datasets named
#' M1, M2, M3, M4 and M5 described in the paper references below.
#' The general model is given by:
#' \deqn{Y = g(B'X) + \epsilon}
#'
#' @param name One of \code{"M1"}, \code{"M2"}, \code{"M3"}, \code{"M4"} or \code{"M5"}
#' @param n nr samples
#' @param B SDR basis used for dataset creation if supplied.
#' @param p Dim. of random variable \code{X}.
#' @param p.mix Only for \code{"M4"}, see: below.
#' @param lambda Only for \code{"M4"}, see: below.
#'
#' @return List with elements
#' \itemize{
#'      \item{X}{data}
#'      \item{Y}{response}
#'      \item{B}{Used dim-reduction matrix}
#'      \item{name}{Name of the dataset (name parameter)}
#' }
#'
#' @section M1:
#' The data follows \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, Sigma)} for a subspace
#' dimension of \eqn{k = 2} with a default of \eqn{n = 200} data points.
#' The link function \eqn{g} is given as
#' \deqn{g(x) = \frac{x_1}{0.5 + (x_2 + 1.5)^2} + \epsilon / 2}{%
#'       g(x) = x_1 / (0.5 + (x_2 + 1.5)^2) + epsilon / 2}
#' @section M2:
#' \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, Sigma)} with \eqn{k = 2} with a
#' default of \eqn{n = 200} data points.
#' The link function \eqn{g} is given as
#' \deqn{g(x) = (b_1^T X) (b_2^T X)^2 + \epsilon / 2}
#' @section M3:
#' \deqn{g(x) = cos(b_1^T X) + \epsilon / 2}
#' @section M4:
#' TODO:
#' @section M5:
#' TODO:
#'
#' @import stats
#' @importFrom stats rnorm rbinom
#' @export
dataset <- function(name = "M1", n = NULL, p = 20, sigma = 0.5, ...) {
    name <- toupper(name)
    if (nchar(name) == 1) { name <- paste0("M", name) }

    if (name == "M1") {
        if (missing(n)) { n <- 100 }
        # B ... `p x 1`
        B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1)
        X <- rmvnorm(n, sigma = sigma^abs(outer(1:p, 1:p, FUN = `-`)))
        beta <- 0.5
        Y <- cos(X %*% B) + rgnorm(n, 0,
            alpha = sqrt(0.25 * gamma(1 / beta) / gamma(3 / beta)),
            beta = beta
        )
    } else if (name == "M2") {
        if (missing(n)) { n <- 100 }
        prob <- 0.3
        lambda <- 1 # dispersion
        # B ... `p x 1`
        B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1)
        Z <- 2 * rbinom(n, 1, prob) - 1
        X <- matrix(rep(lambda * Z, p) + rnorm(n * p), n)
        Y <- cos(X %*% B) + rnorm(n, 0, sigma)
    } else if (name == "M3") {
        if (missing(n)) { n <- 200 }
        # B ... `p x 1`
        B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1)
        X <- matrix(rnorm(n * p), n)
        Y <- 1.5 * log(2 + abs(X %*% B)) + rnorm(n, 0, sigma^2)
    } else if (name == "M4") {
        if (missing(n)) { n <- 200 }
        # B ... `p x 2`
        B <- cbind(
            c(rep(1 / sqrt(6), 6), rep(0, p - 6)),
            c(rep(c(1, -1), 3) / sqrt(6), rep(0, p - 6))
        )
        X <- rmvnorm(n, sigma = sigma^abs(outer(1:p, 1:p, FUN = `-`)))
        XB <- X %*% B
        Y <- (XB[, 1]) / (0.5 + (XB[, 2] + 1.5)^2) + rnorm(n, 0, sigma^2)
    } else if (name == "M5") {
        if (missing(n)) { n <- 200 }
        # B ... `p x 2`
        B <- cbind(
            c(rep(1,        6), rep(0, p - 6)),
            c(rep(c(1, -1), 3), rep(0, p - 6))
        ) / sqrt(6)
        X <- matrix(runif(n * p), n)
        XB <- X %*% B
        Y <- cos(XB[, 1] * pi) * (XB[, 2] + 1)^2 + rnorm(n, 0, sigma^2)
    } else if (name == "M6") {
        if (missing(n)) { n <- 200 }
        # B ... `p x 3`
        B <- diag(p)[, -(3:(p - 1))]
        X <- matrix(rnorm(n * p), n)
        Y <- rowSums((X %*% B)^2) + rnorm(n, 0, sigma^2)
    } else if (name == "M7") {
        if (missing(n)) { n <- 400 }
        # B ... `p x 4`
        B <- diag(p)[, -(4:(p - 1))]
        # "R"andom "M"ulti"V"ariate "S"tudent
        X <- rmvt(n = n, sigma = diag(p), df = 3)
        XB <- X %*% B
        Y <- (XB[, 1]) * (XB[, 2])^2 + (XB[, 3]) * (XB[, 4])
        Y <- Y + rlaplace(n, 0, sigma)
    } else {
        stop("Got unknown dataset name.")
    }

    return(list(X = X, Y = Y, B = B, name = name))
}