2
0
Fork 0
CVE/CVE_C/R/datasets.R

194 lines
6.3 KiB
R

#'
#' @param n number of samples.
#' @param mu mean
#' @param sigma covariance matrix.
#'
#' @returns a \eqn{n\times p} matrix with samples in its rows.
#'
#' @examples
#' rmvnorm(20, sigma = matrix(c(2, 1, 1, 2), 2))
#' rmvnorm(20, mu = c(3, -1, 2))
rmvnorm <- function(n = 1, mu = rep(0, p), sigma = diag(p)) {
if (!missing(sigma)) {
p <- nrow(sigma)
} else if (!missing(mu)) {
mu <- matrix(mu, ncol = 1)
p <- nrow(mu)
} else {
stop("At least one of 'mu' or 'sigma' must be supplied.")
}
# See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution
return(rep(mu, each = n) + matrix(rnorm(n * p), n) %*% chol(sigma))
}
#' Samples from the multivariate t distribution (student distribution).
#'
#' @param n number of samples.
#' @param mu mean, ... TODO:
#' @param sigma a \eqn{k\times k} positive definite matrix. If the degree
#' \eqn{\nu} if bigger than 2 the created covariance is
#' \deqn{var(x) = \Sigma\frac{\nu}{\nu - 2}}
#' for \eqn{\nu > 2}.
#' @param df degree of freedom \eqn{\nu}.
#'
#' @returns a \eqn{n\times p} matrix with samples in its rows.
#'
#' @examples
#' rmvt(20, c(0, 1), matrix(c(3, 1, 1, 2), 2), 3)
#' rmvt(20, sigma = matrix(c(2, 1, 1, 2), 2), 3)
#' rmvt(20, mu = c(3, -1, 2), 3)
rmvt <- function(n = 1, mu = rep(0, p), sigma = diag(p), df = Inf) {
if (!missing(sigma)) {
p <- nrow(sigma)
} else if (!missing(mu)) {
mu <- matrix(mu, ncol = 1)
p <- nrow(mu)
} else {
stop("At least one of 'mu' or 'sigma' must be supplied.")
}
if (df == Inf) {
Z <- 1
} else {
Z <- sqrt(df / rchisq(n, df))
}
return(rmvnorm(n, sigma = sigma) * Z + rep(mu, each = n))
}
#' Generalized Normal Distribution.
#' see: https://en.wikipedia.org/wiki/Generalized_normal_distribution
rgnorm <- function(n = 1, mu = 0, alpha = 1, beta = 1) {
if (alpha <= 0 | beta <= 0) {
stop("alpha and beta must be positive.")
}
lambda <- (1 / alpha)^beta
scales <- qgamma(runif(n), shape = 1 / beta, scale = 1 / lambda)^(1 / beta)
return(scales * ((-1)^rbinom(n, 1, 0.5)) + mu)
}
#' Laplace distribution
#' see: https://en.wikipedia.org/wiki/Laplace_distribution
rlaplace <- function(n = 1, mu = 0, sigma = 1) {
U <- runif(n, -0.5, 0.5)
scale <- sigma / sqrt(2)
return(mu - scale * sign(U) * log(1 - 2 * abs(U)))
}
#' Generates test datasets.
#'
#' Provides sample datasets. There are 5 different datasets named
#' M1, M2, M3, M4 and M5 described in the paper references below.
#' The general model is given by:
#' \deqn{Y = g(B'X) + \epsilon}
#'
#' @param name One of \code{"M1"}, \code{"M2"}, \code{"M3"}, \code{"M4"} or \code{"M5"}
#' @param n nr samples
#' @param B SDR basis used for dataset creation if supplied.
#' @param p Dim. of random variable \code{X}.
#' @param p.mix Only for \code{"M4"}, see: below.
#' @param lambda Only for \code{"M4"}, see: below.
#'
#' @return List with elements
#' \itemize{
#' \item{X}{data}
#' \item{Y}{response}
#' \item{B}{Used dim-reduction matrix}
#' \item{name}{Name of the dataset (name parameter)}
#' }
#'
#' @section M1:
#' The data follows \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, Sigma)} for a subspace
#' dimension of \eqn{k = 2} with a default of \eqn{n = 200} data points.
#' The link function \eqn{g} is given as
#' \deqn{g(x) = \frac{x_1}{0.5 + (x_2 + 1.5)^2} + \epsilon / 2}{%
#' g(x) = x_1 / (0.5 + (x_2 + 1.5)^2) + epsilon / 2}
#' @section M2:
#' \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, Sigma)} with \eqn{k = 2} with a
#' default of \eqn{n = 200} data points.
#' The link function \eqn{g} is given as
#' \deqn{g(x) = (b_1^T X) (b_2^T X)^2 + \epsilon / 2}
#' @section M3:
#' \deqn{g(x) = cos(b_1^T X) + \epsilon / 2}
#' @section M4:
#' TODO:
#' @section M5:
#' TODO:
#'
#' @import stats
#' @importFrom stats rnorm rbinom
#' @export
dataset <- function(name = "M1", n = NULL, p = 20, sigma = 0.5, ...) {
name <- toupper(name)
if (nchar(name) == 1) { name <- paste0("M", name) }
if (name == "M1") {
if (missing(n)) { n <- 100 }
# B ... `p x 1`
B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1)
X <- rmvnorm(n, sigma = sigma^abs(outer(1:p, 1:p, FUN = `-`)))
beta <- 0.5
Y <- cos(X %*% B) + rgnorm(n, 0,
alpha = sqrt(0.25 * gamma(1 / beta) / gamma(3 / beta)),
beta = beta
)
} else if (name == "M2") {
if (missing(n)) { n <- 100 }
prob <- 0.3
lambda <- 1 # dispersion
# B ... `p x 1`
B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1)
Z <- 2 * rbinom(n, 1, prob) - 1
X <- matrix(rep(lambda * Z, p) + rnorm(n * p), n)
Y <- cos(X %*% B) + rnorm(n, 0, sigma)
} else if (name == "M3") {
if (missing(n)) { n <- 200 }
# B ... `p x 1`
B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1)
X <- matrix(rnorm(n * p), n)
Y <- 1.5 * log(2 + abs(X %*% B)) + rnorm(n, 0, sigma^2)
} else if (name == "M4") {
if (missing(n)) { n <- 200 }
# B ... `p x 2`
B <- cbind(
c(rep(1 / sqrt(6), 6), rep(0, p - 6)),
c(rep(c(1, -1), 3) / sqrt(6), rep(0, p - 6))
)
X <- rmvnorm(n, sigma = sigma^abs(outer(1:p, 1:p, FUN = `-`)))
XB <- X %*% B
Y <- (XB[, 1]) / (0.5 + (XB[, 2] + 1.5)^2) + rnorm(n, 0, sigma^2)
} else if (name == "M5") {
if (missing(n)) { n <- 200 }
# B ... `p x 2`
B <- cbind(
c(rep(1, 6), rep(0, p - 6)),
c(rep(c(1, -1), 3), rep(0, p - 6))
) / sqrt(6)
X <- matrix(runif(n * p), n)
XB <- X %*% B
Y <- cos(XB[, 1] * pi) * (XB[, 2] + 1)^2 + rnorm(n, 0, sigma^2)
} else if (name == "M6") {
if (missing(n)) { n <- 200 }
# B ... `p x 3`
B <- diag(p)[, -(3:(p - 1))]
X <- matrix(rnorm(n * p), n)
Y <- rowSums((X %*% B)^2) + rnorm(n, 0, sigma^2)
} else if (name == "M7") {
if (missing(n)) { n <- 400 }
# B ... `p x 4`
B <- diag(p)[, -(4:(p - 1))]
# "R"andom "M"ulti"V"ariate "S"tudent
X <- rmvt(n = n, sigma = diag(p), df = 3)
XB <- X %*% B
Y <- (XB[, 1]) * (XB[, 2])^2 + (XB[, 3]) * (XB[, 4])
Y <- Y + rlaplace(n, 0, sigma)
} else {
stop("Got unknown dataset name.")
}
return(list(X = X, Y = Y, B = B, name = name))
}