#' Generates test datasets. #' #' Provides sample datasets. There are 5 different datasets named #' M1, M2, M3, M4 and M5 described in the paper references below. #' The general model is given by: #' \deqn{Y ~ g(B'X) + \epsilon} #' #' @param name One of \code{"M1"}, \code{"M2"}, \code{"M3"}, \code{"M4"} or \code{"M5"} #' @param n nr samples #' @param p Dim. of random variable \code{X}. #' @param p.mix Only for \code{"M4"}, see: below. #' @param lambda Only for \code{"M4"}, see: below. #' #' @return List with elements #' \itemize{ #' \item{X}{data} #' \item{Y}{response} #' \item{B}{Used dim-reduction matrix} #' \item{name}{Name of the dataset (name parameter)} #' } #' #' @section M1: #' The data follows \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, Sigma)} for a subspace #' dimension of \eqn{k = 2} with a default of \eqn{n = 200} data points. #' The link function \eqn{g} is given as #' \deqn{g(x) = \frac{x_1}{0.5 + (x_2 + 1.5)^2} + \epsilon / 2}{% #' g(x) = x_1 / (0.5 + (x_2 + 1.5)^2) + epsilon / 2} #' @section M2: #' \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, Sigma)} with \eqn{k = 2} with a #' default of \eqn{n = 200} data points. #' The link function \eqn{g} is given as #' \deqn{g(x) = (b_1^T X) (b_2^T X)^2 + \epsilon / 2} #' @section M3: #' \deqn{g(x) = cos(b_1^T X) + \epsilon / 2} #' @section M4: #' TODO: #' @section M5: #' TODO: #' #' @import stats #' @importFrom stats rnorm rbinom #' @export dataset <- function(name = "M1", n, B, p.mix = 0.3, lambda = 1.0) { # validate parameters stopifnot(name %in% c("M1", "M2", "M3", "M4", "M5")) # set default values if not supplied if (missing(n)) { n <- if (name %in% c("M1", "M2")) 200 else if (name != "M5") 100 else 42 } if (missing(B)) { p <- 12 if (name == "M1") { B <- cbind( c( 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0), c( 1,-1, 1,-1, 1,-1, 0, 0, 0, 0, 0, 0) ) / sqrt(6) } else if (name == "M2") { B <- cbind( c(c(1, 0), rep(0, 10)), c(c(0, 1), rep(0, 10)) ) } else { B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, 6)), 12, 1) } } else { p <- dim(B)[1] # validate col. nr to match dataset `k = dim(B)[2]` stopifnot( name %in% c("M1", "M2") && dim(B)[2] == 2, name %in% c("M3", "M4", "M5") && dim(B)[2] == 1 ) } # set link function `g` for model `Y ~ g(B'X) + epsilon` if (name == "M1") { g <- function(BX) { BX[1] / (0.5 + (BX[2] + 1.5)^2) } } else if (name == "M2") { g <- function(BX) { BX[1] * BX[2]^2 } } else if (name %in% c("M3", "M4")) { g <- function(BX) { cos(BX[1]) } } else { # name == "M5" g <- function(BX) { 2 * log(abs(BX[1]) + 1) } } # compute X if (name != "M4") { # compute root of the covariance matrix according the dataset if (name %in% c("M1", "M3")) { # Variance-Covariance structure for `X ~ N_p(0, \Sigma)` with # `\Sigma_{i, j} = 0.5^{|i - j|}`. Sigma <- matrix(0.5^abs(kronecker(1:p, 1:p, '-')), p, p) # decompose Sigma to Sigma.root^T Sigma.root = Sigma for usage in creation of `X` Sigma.root <- chol(Sigma) } else { # name %in% c("M2", "M5") Sigma.root <- diag(rep(1, p)) # d-dim identity } # data `X` as multivariate random normal variable with # variance matrix `Sigma`. X <- replicate(p, rnorm(n, 0, 1)) %*% Sigma.root } else { # name == "M4" X <- t(replicate(100, rep((1 - 2 * rbinom(1, 1, p.mix)) * lambda, p) + rnorm(p, 0, 1))) } # responce `y ~ g(B'X) + epsilon` with `epsilon ~ N(0, 1 / 2)` Y <- apply(X, 1, function(X_i) { g(t(B) %*% X_i) + rnorm(1, 0, 0.5) }) return(list(X = X, Y = Y, B = B, name = name)) }