#' #' @param n number of samples. #' @param mu mean #' @param sigma covariance matrix. #' #' @returns a \eqn{n\times p} matrix with samples in its rows. #' #' @examples #' rmvnorm(20, sigma = matrix(c(2, 1, 1, 2), 2)) #' rmvnorm(20, mu = c(3, -1, 2)) rmvnorm <- function(n = 1, mu = rep(0, p), sigma = diag(p)) { if (!missing(sigma)) { p <- nrow(sigma) } else if (!missing(mu)) { mu <- matrix(mu, ncol = 1) p <- nrow(mu) } else { stop("At least one of 'mu' or 'sigma' must be supplied.") } # See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution return(rep(mu, each = n) + matrix(rnorm(n * p), n) %*% chol(sigma)) } #' Samples from the multivariate t distribution (student distribution). #' #' @param n number of samples. #' @param mu mean, ... TODO: #' @param sigma a \eqn{k\times k} positive definite matrix. If the degree #' \eqn{\nu} if bigger than 2 the created covariance is #' \deqn{var(x) = \Sigma\frac{\nu}{\nu - 2}} #' for \eqn{\nu > 2}. #' @param df degree of freedom \eqn{\nu}. #' #' @returns a \eqn{n\times p} matrix with samples in its rows. #' #' @examples #' rmvt(20, c(0, 1), matrix(c(3, 1, 1, 2), 2), 3) #' rmvt(20, sigma = matrix(c(2, 1, 1, 2), 2), 3) #' rmvt(20, mu = c(3, -1, 2), 3) rmvt <- function(n = 1, mu = rep(0, p), sigma = diag(p), df = Inf) { if (!missing(sigma)) { p <- nrow(sigma) } else if (!missing(mu)) { mu <- matrix(mu, ncol = 1) p <- nrow(mu) } else { stop("At least one of 'mu' or 'sigma' must be supplied.") } if (df == Inf) { Z <- 1 } else { Z <- sqrt(df / rchisq(n, df)) } return(rmvnorm(n, sigma = sigma) * Z + rep(mu, each = n)) } #' Generalized Normal Distribution. #' see: https://en.wikipedia.org/wiki/Generalized_normal_distribution rgnorm <- function(n = 1, mu = 0, alpha = 1, beta = 1) { if (alpha <= 0 | beta <= 0) { stop("alpha and beta must be positive.") } lambda <- (1 / alpha)^beta scales <- qgamma(runif(n), shape = 1 / beta, scale = 1 / lambda)^(1 / beta) return(scales * ((-1)^rbinom(n, 1, 0.5)) + mu) } #' Laplace distribution #' see: https://en.wikipedia.org/wiki/Laplace_distribution rlaplace <- function(n = 1, mu = 0, sigma = 1) { U <- runif(n, -0.5, 0.5) scale <- sigma / sqrt(2) return(mu - scale * sign(U) * log(1 - 2 * abs(U))) } #' Generates test datasets. #' #' Provides sample datasets. There are 5 different datasets named #' M1, M2, M3, M4 and M5 described in the paper references below. #' The general model is given by: #' \deqn{Y = g(B'X) + \epsilon} #' #' @param name One of \code{"M1"}, \code{"M2"}, \code{"M3"}, \code{"M4"} or \code{"M5"} #' @param n nr samples #' @param B SDR basis used for dataset creation if supplied. #' @param p Dim. of random variable \code{X}. #' @param p.mix Only for \code{"M4"}, see: below. #' @param lambda Only for \code{"M4"}, see: below. #' #' @return List with elements #' \itemize{ #' \item{X}{data} #' \item{Y}{response} #' \item{B}{Used dim-reduction matrix} #' \item{name}{Name of the dataset (name parameter)} #' } #' #' @section M1: #' The data follows \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, Sigma)} for a subspace #' dimension of \eqn{k = 2} with a default of \eqn{n = 200} data points. #' The link function \eqn{g} is given as #' \deqn{g(x) = \frac{x_1}{0.5 + (x_2 + 1.5)^2} + \epsilon / 2}{% #' g(x) = x_1 / (0.5 + (x_2 + 1.5)^2) + epsilon / 2} #' @section M2: #' \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, Sigma)} with \eqn{k = 2} with a #' default of \eqn{n = 200} data points. #' The link function \eqn{g} is given as #' \deqn{g(x) = (b_1^T X) (b_2^T X)^2 + \epsilon / 2} #' @section M3: #' \deqn{g(x) = cos(b_1^T X) + \epsilon / 2} #' @section M4: #' TODO: #' @section M5: #' TODO: #' #' @import stats #' @importFrom stats rnorm rbinom #' @export dataset <- function(name = "M1", n = NULL, p = 20, sigma = 0.5, ...) { name <- toupper(name) if (nchar(name) == 1) { name <- paste0("M", name) } if (name == "M1") { if (missing(n)) { n <- 100 } # B ... `p x 1` B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1) X <- rmvnorm(n, sigma = sigma^abs(outer(1:p, 1:p, FUN = `-`))) beta <- 0.5 Y <- cos(X %*% B) + rgnorm(n, 0, alpha = sqrt(0.25 * gamma(1 / beta) / gamma(3 / beta)), beta = beta ) } else if (name == "M2") { if (missing(n)) { n <- 100 } prob <- 0.3 lambda <- 1 # dispersion # B ... `p x 1` B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1) Z <- 2 * rbinom(n, 1, prob) - 1 X <- matrix(rep(lambda * Z, p) + rnorm(n * p), n) Y <- cos(X %*% B) + rnorm(n, 0, sigma) } else if (name == "M3") { if (missing(n)) { n <- 200 } # B ... `p x 1` B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1) X <- matrix(rnorm(n * p), n) Y <- 1.5 * log(2 + abs(X %*% B)) + rnorm(n, 0, sigma^2) } else if (name == "M4") { if (missing(n)) { n <- 200 } # B ... `p x 2` B <- cbind( c(rep(1 / sqrt(6), 6), rep(0, p - 6)), c(rep(c(1, -1), 3) / sqrt(6), rep(0, p - 6)) ) X <- rmvnorm(n, sigma = sigma^abs(outer(1:p, 1:p, FUN = `-`))) XB <- X %*% B Y <- (XB[, 1]) / (0.5 + (XB[, 2] + 1.5)^2) + rnorm(n, 0, sigma^2) } else if (name == "M5") { if (missing(n)) { n <- 200 } # B ... `p x 2` B <- cbind( c(rep(1, 6), rep(0, p - 6)), c(rep(c(1, -1), 3), rep(0, p - 6)) ) / sqrt(6) X <- matrix(runif(n * p), n) XB <- X %*% B Y <- cos(XB[, 1] * pi) * (XB[, 2] + 1)^2 + rnorm(n, 0, sigma^2) } else if (name == "M6") { if (missing(n)) { n <- 200 } # B ... `p x 3` B <- diag(p)[, -(3:(p - 1))] X <- matrix(rnorm(n * p), n) Y <- rowSums((X %*% B)^2) + rnorm(n, 0, sigma^2) } else if (name == "M7") { if (missing(n)) { n <- 400 } # B ... `p x 4` B <- diag(p)[, -(4:(p - 1))] # "R"andom "M"ulti"V"ariate "S"tudent X <- rmvt(n = n, sigma = diag(p), df = 3) XB <- X %*% B Y <- (XB[, 1]) * (XB[, 2])^2 + (XB[, 3]) * (XB[, 4]) Y <- Y + rlaplace(n, 0, sigma) } else { stop("Got unknown dataset name.") } return(list(X = X, Y = Y, B = B, name = name)) }