2
0
Fork 0
CVE/CVarE/man/dataset.Rd

129 lines
5.1 KiB
R

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/datasets.R
\name{dataset}
\alias{dataset}
\title{Generates test datasets.}
\usage{
dataset(name = "M1", n = NULL, p = 20, sd = 0.5, ...)
}
\arguments{
\item{name}{One of \code{"M1"}, \code{"M2"}, \code{"M3"}, \code{"M4"},
\code{"M5"}, \code{"M6"} or \code{"M7"}. Alternatively, just the dataset number
1-7.}
\item{n}{number of samples.}
\item{p}{Dimension of random variable \eqn{X}.}
\item{sd}{standard deviation for error term \eqn{\epsilon}.}
\item{...}{Additional parameters only for \code{"M2"} (namely \code{pmix} and
\code{lambda}), see the M2 section below.}
}
\value{
List with elements
\describe{
\item{X}{data, a \eqn{n\times p}{n x p} matrix.}
\item{Y}{response.}
\item{B}{the dim-reduction matrix}
\item{name}{Name of the dataset (name parameter)}
}
}
\description{
Provides sample datasets M1-M7 used in the paper "Conditional variance
estimation for sufficient dimension reduction" by Lukas Fertl and Efstathia Bura.
The general model is given by:
\deqn{Y = g(B'X) + \epsilon}
}
\section{M1}{
The predictors are distributed as
\eqn{X\sim N_p(0, \Sigma)}{X ~ N_p(0, \Sigma)} with
\eqn{\Sigma_{i, j} = 0.5^{|i - j|}}{\Sigma_ij = 0.5^|i - j|} for
\eqn{i, j = 1,..., p} for a subspace dimension of \eqn{k = 1} with a default
of \eqn{n = 100} data points. \eqn{p = 20},
\eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)}, and \eqn{Y} is
given as \deqn{Y = cos(b_1'X) + \epsilon} where \eqn{\epsilon} follows a
generalized normal distribution with location 0,
shape-parameter 0.5, and the scale-parameter is chosen such that
\eqn{Var(\epsilon) = 0.5}.
}
\section{M2}{
The predictors are distributed as \eqn{X \sim Z 1_p \lambda + N_p(0, I_p)}{X ~ Z 1_p \lambda + N_p(0, I_p)} with
\eqn{Z \sim 2 Binom(p_{mix}) - 1\in\{-1, 1\}}{Z~2Binom(pmix)-1} where
\eqn{1_p} is the \eqn{p}-dimensional vector of ones, for a subspace
dimension of \eqn{k = 1} with a default of \eqn{n = 100} data points.
\eqn{p = 20}, \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)},
and \eqn{Y} is \deqn{Y = cos(b_1'X) + 0.5\epsilon} where \eqn{\epsilon} is
standard normal.
The default for \code{pmix} is 0.3 and \code{lambda} defaults to 1.
}
\section{M3}{
The predictors are distributed as \eqn{X\sim N_p(0, I_p)}{X~N_p(0, I_p)}
for a subspace
dimension of \eqn{k = 1} with a default of \eqn{n = 100} data points.
\eqn{p = 20}, \eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)},
and \eqn{Y} is
\deqn{Y = 2 log(|b_1'X| + 2) + 0.5\epsilon} where \eqn{\epsilon} is
standard normal.
}
\section{M4}{
The predictors are distributed as \eqn{X\sim N_p(0,\Sigma)}{X~N_p(0,\Sigma)}
with \eqn{\Sigma_{i, j} = 0.5^{|i - j|}}{\Sigma_ij = 0.5^|i - j|} for
\eqn{i, j = 1,..., p} for a subspace dimension of \eqn{k = 2} with a default
of \eqn{n = 100} data points. \eqn{p = 20},
\eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)},
\eqn{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / sqrt(6)}
and \eqn{Y} is given as \deqn{Y = \frac{b_1'X}{0.5 + (1.5 + b_2'X)^2} + 0.5\epsilon}{Y = (b_1'X) / (0.5 + (1.5 + b_2'X)^2) + 0.5\epsilon}
where \eqn{\epsilon} is standard normal.
}
\section{M5}{
The predictors are distributed as \eqn{X\sim U([0,1]^p)}{X~U([0, 1]^p)}
where \eqn{U([0, 1]^p)} is the uniform distribution with
independent components on the \eqn{p}-dimensional hypercube for a subspace
dimension of \eqn{k = 2} with a default of \eqn{n = 200} data points.
\eqn{p = 20},
\eqn{b_1 = (1,1,1,1,1,1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_1 = (1,1,1,1,1,1,0,...,0)' / sqrt(6)},
\eqn{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / \sqrt{6}\in\mathcal{R}^p}{b_2 = (1,-1,1,-1,1,-1,0,...,0)' / sqrt(6)}
and \eqn{Y} is given as \deqn{Y = cos(\pi b_1'X)(b_2'X + 1)^2 + 0.5\epsilon}
where \eqn{\epsilon} is standard normal.
}
\section{M6}{
The predictors are distributed as \eqn{X\sim N_p(0, I_p)}{X~N_p(0, I_p)}
for a subspace dimension of \eqn{k = 3} with a default of \eqn{n = 200} data
points. \eqn{p = 20, b_1 = e_1, b_2 = e_2}, and \eqn{b_3 = e_p}, where
\eqn{e_j} is the \eqn{j}-th unit vector in the \eqn{p}-dimensional space.
\eqn{Y} is given as \deqn{Y = (b_1'X)^2+(b_2'X)^2+(b_3'X)^2+0.5\epsilon}
where \eqn{\epsilon} is standard normal.
}
\section{M7}{
The predictors are distributed as \eqn{X\sim t_3(I_p)}{X~t_3(I_p)} where
\eqn{t_3(I_p)} is the standard multivariate t-distribution with 3 degrees of
freedom, for a subspace dimension of \eqn{k = 4} with a default of
\eqn{n = 200} data points.
\eqn{p = 20, b_1 = e_1, b_2 = e_2, b_3 = e_3}, and \eqn{b_4 = e_p}, where
\eqn{e_j} is the \eqn{j}-th unit vector in the \eqn{p}-dimensional space.
\eqn{Y} is given as \deqn{Y = (b_1'X)(b_2'X)^2+(b_3'X)(b_4'X)+0.5\epsilon}
where \eqn{\epsilon} follows a generalized normal distribution with
location 0, shape-parameter 1, and the scale-parameter is chosen such that
\eqn{Var(\epsilon) = 0.25}.
}
\references{
Fertl, L. and Bura, E. (2021), Conditional Variance
Estimation for Sufficient Dimension Reduction.
arXiv:2102.08782
}