% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/datasets.R
\name{dataset} \alias{dataset} \title{Generates test datasets.} \usage{ dataset(name = "M1", n = NULL, p = 20, sd = 0.5, ...) } \arguments{ \item{name}{One of \code{"M1"}, \code{"M2"}, \code{"M3"}, \code{"M4"}, \code{"M5"}, \code{"M6"} or \code{"M7"}. Alternatively, just the dataset number 1-7.} \item{n}{number of samples.} \item{p}{Dimension of random variable \eqn{X}.} \item{sd}{standard deviation for error term \eqn{\epsilon}.} \item{...}{Additional parameters only for "M2" (namely \code{pmix} and \code{lambda}), see below.} } \value{ List with elements \describe{ \item{X}{data, a \eqn{n\times p}{n x p} matrix.} \item{Y}{response.} \item{B}{the dim-reduction matrix} \item{name}{Name of the dataset (name parameter)} } } \description{ Provides sample datasets M1-M7 used in the paper Conditional variance estimation for sufficient dimension reduction, Lukas Fertl, Efstathia Bura. The general model is given by: \deqn{Y = g(B'X) + \epsilon} } \section{M1}{ The predictors are distributed as \eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, \Sigma)} with \eqn{\Sigma_{i, j} = 0.5^{|i - j|}}{\Sigma_ij = 0.5^|i - j|} for \eqn{i, j = 1,..., p} for a subspace dimension of \eqn{k = 1} with a default of \eqn{n = 100} data points. \eqn{p = 20}, \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)}, and \eqn{Y} is given as \deqn{Y = cos(b_1'X) + \epsilon} where \eqn{\epsilon} is distributed as a generalized normal distribution with location 0, shape-parameter 0.5, and the scale-parameter is chosen such that \eqn{Var(\epsilon) = 0.5}. } \section{M2}{ The predictors are distributed as \eqn{X \sim Z 1_p \lambda + N_p(0, I_p)}{X ~ Z 1_p \lambda + N_p(0, I_p)}
with \eqn{Z \sim 2 Binom(p_{mix}) - 1\in\{-1, 1\}}{Z~2Binom(pmix)-1} where \eqn{1_p} is the \eqn{p}-dimensional vector of ones, for a subspace dimension of \eqn{k = 1} with a default of \eqn{n = 100} data points. \eqn{p = 20}, \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)}, and \eqn{Y} is \deqn{Y = cos(b_1'X) + 0.5\epsilon} where \eqn{\epsilon} is standard normal. The default for \code{pmix} is 0.3 and \code{lambda} defaults to 1. } \section{M3}{ The predictors are distributed as \eqn{X\sim N_p(0, I_p)}{X~N_p(0, I_p)} for a subspace dimension of \eqn{k = 1} with a default of \eqn{n = 100} data points. \eqn{p = 20}, \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)}, and \eqn{Y} is \deqn{Y = 2 log(|b_1'X| + 2) + 0.5\epsilon} where \eqn{\epsilon} is standard normal. } \section{M4}{ The predictors are distributed as \eqn{X\sim N_p(0,\Sigma)}{X~N_p(0,\Sigma)} with \eqn{\Sigma_{i, j} = 0.5^{|i - j|}}{\Sigma_ij = 0.5^|i - j|} for \eqn{i, j = 1,..., p} for a subspace dimension of \eqn{k = 2} with a default of \eqn{n = 100} data points. \eqn{p = 20}, \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)}, \eqn{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / sqrt(6)} and \eqn{Y} is given as \deqn{Y = \frac{b_1'X}{0.5 + (1.5 + b_2'X)^2} + 0.5\epsilon}{Y = (b_1'X) / (0.5 + (1.5 + b_2'X)^2) + 0.5\epsilon} where \eqn{\epsilon} is standard normal. } \section{M5}{ The predictors are distributed as \eqn{X\sim U([0,1]^p)}{X~U([0, 1]^p)} where \eqn{U([0, 1]^p)} is the uniform distribution with independent components on the \eqn{p}-dimensional hypercube for a subspace dimension of \eqn{k = 2} with a default of \eqn{n = 200} data points.
\eqn{p = 20}, \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)}, \eqn{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / sqrt(6)} and \eqn{Y} is given as \deqn{Y = cos(\pi b_1'X)(b_2'X + 1)^2 + 0.5\epsilon} where \eqn{\epsilon} is standard normal. } \section{M6}{ The predictors are distributed as \eqn{X\sim N_p(0, I_p)}{X~N_p(0, I_p)} for a subspace dimension of \eqn{k = 3} with a default of \eqn{n = 200} data points. \eqn{p = 20, b_1 = e_1, b_2 = e_2}, and \eqn{b_3 = e_p}, where \eqn{e_j} is the \eqn{j}-th unit vector in the \eqn{p}-dimensional space. \eqn{Y} is given as \deqn{Y = (b_1'X)^2+(b_2'X)^2+(b_3'X)^2+0.5\epsilon} where \eqn{\epsilon} is standard normal. } \section{M7}{ The predictors are distributed as \eqn{X\sim t_3(I_p)}{X~t_3(I_p)} where \eqn{t_3(I_p)} is the standard multivariate t-distribution with 3 degrees of freedom, for a subspace dimension of \eqn{k = 4} with a default of \eqn{n = 200} data points. \eqn{p = 20, b_1 = e_1, b_2 = e_2, b_3 = e_3}, and \eqn{b_4 = e_p}, where \eqn{e_j} is the \eqn{j}-th unit vector in the \eqn{p}-dimensional space. \eqn{Y} is given as \deqn{Y = (b_1'X)(b_2'X)^2+(b_3'X)(b_4'X)+0.5\epsilon} where \eqn{\epsilon} is distributed as a generalized normal distribution with location 0, shape-parameter 1, and the scale-parameter is chosen such that \eqn{Var(\epsilon) = 0.25}. } \references{ Fertl, L. and Bura, E. (2021) "Conditional Variance Estimation for Sufficient Dimension Reduction" }