112 lines
3.8 KiB
R
112 lines
3.8 KiB
R
#' Generates test datasets.
|
|
#'
|
|
#' Provides sample datasets. There are 5 different datasets named
|
|
#' M1, M2, M3, M4 and M5 describet in the paper references below.
|
|
#' The general model is given by:
|
|
#' \deqn{Y ~ g(B'X) + \epsilon}
|
|
#'
|
|
#' @param name One of \code{"M1"}, \code{"M2"}, \code{"M3"}, \code{"M4"} or \code{"M5"}
|
|
#' @param n nr samples
|
|
#' @param p Dim. of random variable \code{X}.
|
|
#' @param p.mix Only for \code{"M4"}, see: below.
|
|
#' @param lambda Only for \code{"M4"}, see: below.
|
|
#'
|
|
#' @return List with elements
|
|
#' \itemize{
|
|
#' \item{X}{data}
|
|
#' \item{Y}{response}
|
|
#' \item{B}{Used dim-reduction matrix}
|
|
#' \item{name}{Name of the dataset (name parameter)}
|
|
#' }
|
|
#'
|
|
#' @section M1:
|
|
#' The data follows \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, Sigma)} for a subspace
|
|
#' dimension of \eqn{k = 2} with a default of \eqn{n = 200} data points.
|
|
#' The link function \eqn{g} is given as
|
|
#' \deqn{g(x) = \frac{x_1}{0.5 + (x_2 + 1.5)^2} + \epsilon / 2}{%
|
|
#' g(x) = x_1 / (0.5 + (x_2 + 1.5)^2) + epsilon / 2}
|
|
#' @section M2:
|
|
#' \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, Sigma)} with \eqn{k = 2} with a
|
|
#' default of \eqn{n = 200} data points.
|
|
#' The link function \eqn{g} is given as
|
|
#' \deqn{g(x) = (b_1^T X) (b_2^T X)^2 + \epsilon / 2}
|
|
#' @section M3:
|
|
#' \deqn{g(x) = cos(b_1^T X) + \epsilon / 2}
|
|
#' @section M4:
|
|
#' TODO:
|
|
#' @section M5:
|
|
#' TODO:
|
|
#'
|
|
#' @import stats
|
|
#' @importFrom stats rnorm rbinom
|
|
#' @export
|
|
dataset <- function(name = "M1", n, B, p.mix = 0.3, lambda = 1.0) {
|
|
# validate parameters
|
|
stopifnot(name %in% c("M1", "M2", "M3", "M4", "M5"))
|
|
|
|
# set default values if not supplied
|
|
if (missing(n)) {
|
|
n <- if (name %in% c("M1", "M2")) 200 else if (name != "M5") 100 else 42
|
|
}
|
|
if (missing(B)) {
|
|
p <- 12
|
|
if (name == "M1") {
|
|
B <- cbind(
|
|
c( 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0),
|
|
c( 1,-1, 1,-1, 1,-1, 0, 0, 0, 0, 0, 0)
|
|
) / sqrt(6)
|
|
} else if (name == "M2") {
|
|
B <- cbind(
|
|
c(c(1, 0), rep(0, 10)),
|
|
c(c(0, 1), rep(0, 10))
|
|
)
|
|
} else {
|
|
B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, 6)), 12, 1)
|
|
}
|
|
} else {
|
|
p <- dim(B)[1]
|
|
# validate col. nr to match dataset `k = dim(B)[2]`
|
|
stopifnot(
|
|
name %in% c("M1", "M2") && dim(B)[2] == 2,
|
|
name %in% c("M3", "M4", "M5") && dim(B)[2] == 1
|
|
)
|
|
}
|
|
|
|
# set link function `g` for model `Y ~ g(B'X) + epsilon`
|
|
if (name == "M1") {
|
|
g <- function(BX) { BX[1] / (0.5 + (BX[2] + 1.5)^2) }
|
|
} else if (name == "M2") {
|
|
g <- function(BX) { BX[1] * BX[2]^2 }
|
|
} else if (name %in% c("M3", "M4")) {
|
|
g <- function(BX) { cos(BX[1]) }
|
|
} else { # name == "M5"
|
|
g <- function(BX) { 2 * log(abs(BX[1]) + 1) }
|
|
}
|
|
|
|
# compute X
|
|
if (name != "M4") {
|
|
# compute root of the covariance matrix according the dataset
|
|
if (name %in% c("M1", "M3")) {
|
|
# Variance-Covariance structure for `X ~ N_p(0, \Sigma)` with
|
|
# `\Sigma_{i, j} = 0.5^{|i - j|}`.
|
|
Sigma <- matrix(0.5^abs(kronecker(1:p, 1:p, '-')), p, p)
|
|
# decompose Sigma to Sigma.root^T Sigma.root = Sigma for usage in creation of `X`
|
|
Sigma.root <- chol(Sigma)
|
|
} else { # name %in% c("M2", "M5")
|
|
Sigma.root <- diag(rep(1, p)) # d-dim identity
|
|
}
|
|
# data `X` as multivariate random normal variable with
|
|
# variance matrix `Sigma`.
|
|
X <- replicate(p, rnorm(n, 0, 1)) %*% Sigma.root
|
|
} else { # name == "M4"
|
|
X <- t(replicate(100, rep((1 - 2 * rbinom(1, 1, p.mix)) * lambda, p) + rnorm(p, 0, 1)))
|
|
}
|
|
|
|
# responce `y ~ g(B'X) + epsilon` with `epsilon ~ N(0, 1 / 2)`
|
|
Y <- apply(X, 1, function(X_i) {
|
|
g(t(B) %*% X_i) + rnorm(1, 0, 0.5)
|
|
})
|
|
|
|
return(list(X = X, Y = Y, B = B, name = name))
|
|
}
|