194 lines
6.3 KiB
R
194 lines
6.3 KiB
R
#'
|
|
#' @param n number of samples.
|
|
#' @param mu mean
|
|
#' @param sigma covariance matrix.
|
|
#'
|
|
#' @returns a \eqn{n\times p} matrix with samples in its rows.
|
|
#'
|
|
#' @examples
|
|
#' rmvnorm(20, sigma = matrix(c(2, 1, 1, 2), 2))
|
|
#' rmvnorm(20, mu = c(3, -1, 2))
|
|
rmvnorm <- function(n = 1, mu = rep(0, p), sigma = diag(p)) {
|
|
if (!missing(sigma)) {
|
|
p <- nrow(sigma)
|
|
} else if (!missing(mu)) {
|
|
mu <- matrix(mu, ncol = 1)
|
|
p <- nrow(mu)
|
|
} else {
|
|
stop("At least one of 'mu' or 'sigma' must be supplied.")
|
|
}
|
|
|
|
# See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution
|
|
return(rep(mu, each = n) + matrix(rnorm(n * p), n) %*% chol(sigma))
|
|
}
|
|
|
|
#' Samples from the multivariate t distribution (student distribution).
|
|
#'
|
|
#' @param n number of samples.
|
|
#' @param mu mean, ... TODO:
|
|
#' @param sigma a \eqn{k\times k} positive definite matrix. If the degree
|
|
#' \eqn{\nu} if bigger than 2 the created covariance is
|
|
#' \deqn{var(x) = \Sigma\frac{\nu}{\nu - 2}}
|
|
#' for \eqn{\nu > 2}.
|
|
#' @param df degree of freedom \eqn{\nu}.
|
|
#'
|
|
#' @returns a \eqn{n\times p} matrix with samples in its rows.
|
|
#'
|
|
#' @examples
|
|
#' rmvt(20, c(0, 1), matrix(c(3, 1, 1, 2), 2), 3)
|
|
#' rmvt(20, sigma = matrix(c(2, 1, 1, 2), 2), 3)
|
|
#' rmvt(20, mu = c(3, -1, 2), 3)
|
|
rmvt <- function(n = 1, mu = rep(0, p), sigma = diag(p), df = Inf) {
|
|
if (!missing(sigma)) {
|
|
p <- nrow(sigma)
|
|
} else if (!missing(mu)) {
|
|
mu <- matrix(mu, ncol = 1)
|
|
p <- nrow(mu)
|
|
} else {
|
|
stop("At least one of 'mu' or 'sigma' must be supplied.")
|
|
}
|
|
|
|
if (df == Inf) {
|
|
Z <- 1
|
|
} else {
|
|
Z <- sqrt(df / rchisq(n, df))
|
|
}
|
|
|
|
return(rmvnorm(n, sigma = sigma) * Z + rep(mu, each = n))
|
|
}
|
|
|
|
|
|
#' Generalized Normal Distribution.
|
|
#' see: https://en.wikipedia.org/wiki/Generalized_normal_distribution
|
|
rgnorm <- function(n = 1, mu = 0, alpha = 1, beta = 1) {
|
|
if (alpha <= 0 | beta <= 0) {
|
|
stop("alpha and beta must be positive.")
|
|
}
|
|
lambda <- (1 / alpha)^beta
|
|
scales <- qgamma(runif(n), shape = 1 / beta, scale = 1 / lambda)^(1 / beta)
|
|
return(scales * ((-1)^rbinom(n, 1, 0.5)) + mu)
|
|
}
|
|
|
|
#' Laplace distribution
|
|
#' see: https://en.wikipedia.org/wiki/Laplace_distribution
|
|
rlaplace <- function(n = 1, mu = 0, sigma = 1) {
|
|
U <- runif(n, -0.5, 0.5)
|
|
scale <- sigma / sqrt(2)
|
|
|
|
return(mu - scale * sign(U) * log(1 - 2 * abs(U)))
|
|
}
|
|
|
|
#' Generates test datasets.
|
|
#'
|
|
#' Provides sample datasets. There are 5 different datasets named
|
|
#' M1, M2, M3, M4 and M5 described in the paper references below.
|
|
#' The general model is given by:
|
|
#' \deqn{Y = g(B'X) + \epsilon}
|
|
#'
|
|
#' @param name One of \code{"M1"}, \code{"M2"}, \code{"M3"}, \code{"M4"} or \code{"M5"}
|
|
#' @param n nr samples
|
|
#' @param B SDR basis used for dataset creation if supplied.
|
|
#' @param p Dim. of random variable \code{X}.
|
|
#' @param p.mix Only for \code{"M4"}, see: below.
|
|
#' @param lambda Only for \code{"M4"}, see: below.
|
|
#'
|
|
#' @return List with elements
|
|
#' \itemize{
|
|
#' \item{X}{data}
|
|
#' \item{Y}{response}
|
|
#' \item{B}{Used dim-reduction matrix}
|
|
#' \item{name}{Name of the dataset (name parameter)}
|
|
#' }
|
|
#'
|
|
#' @section M1:
|
|
#' The data follows \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, Sigma)} for a subspace
|
|
#' dimension of \eqn{k = 2} with a default of \eqn{n = 200} data points.
|
|
#' The link function \eqn{g} is given as
|
|
#' \deqn{g(x) = \frac{x_1}{0.5 + (x_2 + 1.5)^2} + \epsilon / 2}{%
|
|
#' g(x) = x_1 / (0.5 + (x_2 + 1.5)^2) + epsilon / 2}
|
|
#' @section M2:
|
|
#' \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, Sigma)} with \eqn{k = 2} with a
|
|
#' default of \eqn{n = 200} data points.
|
|
#' The link function \eqn{g} is given as
|
|
#' \deqn{g(x) = (b_1^T X) (b_2^T X)^2 + \epsilon / 2}
|
|
#' @section M3:
|
|
#' \deqn{g(x) = cos(b_1^T X) + \epsilon / 2}
|
|
#' @section M4:
|
|
#' TODO:
|
|
#' @section M5:
|
|
#' TODO:
|
|
#'
|
|
#' @import stats
|
|
#' @importFrom stats rnorm rbinom
|
|
#' @export
|
|
dataset <- function(name = "M1", n = NULL, p = 20, sigma = 0.5, ...) {
|
|
name <- toupper(name)
|
|
if (nchar(name) == 1) { name <- paste0("M", name) }
|
|
|
|
if (name == "M1") {
|
|
if (missing(n)) { n <- 100 }
|
|
# B ... `p x 1`
|
|
B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1)
|
|
X <- rmvnorm(n, sigma = sigma^abs(outer(1:p, 1:p, FUN = `-`)))
|
|
beta <- 0.5
|
|
Y <- cos(X %*% B) + rgnorm(n, 0,
|
|
alpha = sqrt(0.25 * gamma(1 / beta) / gamma(3 / beta)),
|
|
beta = beta
|
|
)
|
|
} else if (name == "M2") {
|
|
if (missing(n)) { n <- 100 }
|
|
prob <- 0.3
|
|
lambda <- 1 # dispersion
|
|
# B ... `p x 1`
|
|
B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1)
|
|
Z <- 2 * rbinom(n, 1, prob) - 1
|
|
X <- matrix(rep(lambda * Z, p) + rnorm(n * p), n)
|
|
Y <- cos(X %*% B) + rnorm(n, 0, sigma)
|
|
} else if (name == "M3") {
|
|
if (missing(n)) { n <- 200 }
|
|
# B ... `p x 1`
|
|
B <- matrix(c(rep(1 / sqrt(6), 6), rep(0, p - 6)), ncol = 1)
|
|
X <- matrix(rnorm(n * p), n)
|
|
Y <- 1.5 * log(2 + abs(X %*% B)) + rnorm(n, 0, sigma^2)
|
|
} else if (name == "M4") {
|
|
if (missing(n)) { n <- 200 }
|
|
# B ... `p x 2`
|
|
B <- cbind(
|
|
c(rep(1 / sqrt(6), 6), rep(0, p - 6)),
|
|
c(rep(c(1, -1), 3) / sqrt(6), rep(0, p - 6))
|
|
)
|
|
X <- rmvnorm(n, sigma = sigma^abs(outer(1:p, 1:p, FUN = `-`)))
|
|
XB <- X %*% B
|
|
Y <- (XB[, 1]) / (0.5 + (XB[, 2] + 1.5)^2) + rnorm(n, 0, sigma^2)
|
|
} else if (name == "M5") {
|
|
if (missing(n)) { n <- 200 }
|
|
# B ... `p x 2`
|
|
B <- cbind(
|
|
c(rep(1, 6), rep(0, p - 6)),
|
|
c(rep(c(1, -1), 3), rep(0, p - 6))
|
|
) / sqrt(6)
|
|
X <- matrix(runif(n * p), n)
|
|
XB <- X %*% B
|
|
Y <- cos(XB[, 1] * pi) * (XB[, 2] + 1)^2 + rnorm(n, 0, sigma^2)
|
|
} else if (name == "M6") {
|
|
if (missing(n)) { n <- 200 }
|
|
# B ... `p x 3`
|
|
B <- diag(p)[, -(3:(p - 1))]
|
|
X <- matrix(rnorm(n * p), n)
|
|
Y <- rowSums((X %*% B)^2) + rnorm(n, 0, sigma^2)
|
|
} else if (name == "M7") {
|
|
if (missing(n)) { n <- 400 }
|
|
# B ... `p x 4`
|
|
B <- diag(p)[, -(4:(p - 1))]
|
|
# "R"andom "M"ulti"V"ariate "S"tudent
|
|
X <- rmvt(n = n, sigma = diag(p), df = 3)
|
|
XB <- X %*% B
|
|
Y <- (XB[, 1]) * (XB[, 2])^2 + (XB[, 3]) * (XB[, 4])
|
|
Y <- Y + rlaplace(n, 0, sigma)
|
|
} else {
|
|
stop("Got unknown dataset name.")
|
|
}
|
|
|
|
return(list(X = X, Y = Y, B = B, name = name))
|
|
}
|