2
0
Fork 0
CVE/CVE_C/R/datasets.R

113 lines
3.9 KiB
R

#' Generates test datasets.
#'
#' Provides sample datasets. There are 5 different datasets named
#' M1, M2, M3, M4 and M5 described in the paper references below.
#' The general model is given by:
#' \deqn{Y = g(B'X) + \epsilon}
#'
#' @param name One of \code{"M1"}, \code{"M2"}, \code{"M3"}, \code{"M4"} or \code{"M5"}
#' @param n nr samples
#' @param B SDR basis used for dataset creation if supplied.
#' @param p Dim. of random variable \code{X}.
#' @param p.mix Only for \code{"M4"}, see: below.
#' @param lambda Only for \code{"M4"}, see: below.
#'
#' @return List with elements
#' \itemize{
#' \item{X}{data}
#' \item{Y}{response}
#' \item{B}{Used dim-reduction matrix}
#' \item{name}{Name of the dataset (name parameter)}
#' }
#'
#' @section M1:
#' The data follows \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, Sigma)} for a subspace
#' dimension of \eqn{k = 2} with a default of \eqn{n = 200} data points.
#' The link function \eqn{g} is given as
#' \deqn{g(x) = \frac{x_1}{0.5 + (x_2 + 1.5)^2} + \epsilon / 2}{%
#' g(x) = x_1 / (0.5 + (x_2 + 1.5)^2) + epsilon / 2}
#' @section M2:
#' \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, Sigma)} with \eqn{k = 2} with a
#' default of \eqn{n = 200} data points.
#' The link function \eqn{g} is given as
#' \deqn{g(x) = (b_1^T X) (b_2^T X)^2 + \epsilon / 2}
#' @section M3:
#' \deqn{g(x) = cos(b_1^T X) + \epsilon / 2}
#' @section M4:
#' TODO:
#' @section M5:
#' TODO:
#'
#' @import stats
#' @importFrom stats rnorm rbinom
#' @export
dataset <- function(name = "M1", n, B, p.mix = 0.3, lambda = 1.0) {
# validate parameters
stopifnot(name %in% c("M1", "M2", "M3", "M4", "M5"))
# set default values if not supplied
if (missing(n)) {
n <- if (name %in% c("M1", "M2")) 200 else if (name != "M5") 100 else 42
}
if (missing(B)) {
p <- 12
if (name == "M1") {
B <- cbind(
c( 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0),
c( 1,-1, 1,-1, 1,-1, 0, 0, 0, 0, 0, 0)
) / sqrt(6)
} else if (name == "M2") {
B <- cbind(
c(c(1, 0), rep(0, 10)),
c(c(0, 1), rep(0, 10))
)
} else {
B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, 6)), 12, 1)
}
} else {
p <- nrow(B)
# validate col. nr to match dataset `k = ncol(B)`
stopifnot(
name %in% c("M1", "M2") && ncol(B) == 2,
name %in% c("M3", "M4", "M5") && ncol(B) == 1
)
}
# set link function `g` for model `Y ~ g(B'X) + epsilon`
if (name == "M1") {
g <- function(BX) { BX[1] / (0.5 + (BX[2] + 1.5)^2) }
} else if (name == "M2") {
g <- function(BX) { BX[1] * BX[2]^2 }
} else if (name %in% c("M3", "M4")) {
g <- function(BX) { cos(BX[1]) }
} else { # name == "M5"
g <- function(BX) { 2 * log(abs(BX[1]) + 1) }
}
# compute X
if (name != "M4") {
# compute root of the covariance matrix according the dataset
if (name %in% c("M1", "M3")) {
# Variance-Covariance structure for `X ~ N_p(0, \Sigma)` with
# `\Sigma_{i, j} = 0.5^{|i - j|}`.
Sigma <- matrix(0.5^abs(kronecker(1:p, 1:p, '-')), p, p)
# decompose Sigma to Sigma.root^T Sigma.root = Sigma for usage in creation of `X`
Sigma.root <- chol(Sigma)
} else { # name %in% c("M2", "M5")
Sigma.root <- diag(rep(1, p)) # d-dim identity
}
# data `X` as multivariate random normal variable with
# variance matrix `Sigma`.
X <- replicate(p, rnorm(n, 0, 1)) %*% Sigma.root
} else { # name == "M4"
X <- t(replicate(100, rep((1 - 2 * rbinom(1, 1, p.mix)) * lambda, p) + rnorm(p, 0, 1)))
}
# responce `y ~ g(B'X) + epsilon` with `epsilon ~ N(0, 1 / 2)`
Y <- apply(X, 1, function(X_i) {
g(t(B) %*% X_i) + rnorm(1, 0, 0.5)
})
return(list(X = X, Y = Y, B = B, name = name))
}