2019-10-18 07:06:36 +00:00
|
|
|
#' Conditional Variance Estimator (CVE) Package.
|
2019-09-16 09:15:51 +00:00
|
|
|
#'
|
2019-10-18 07:06:36 +00:00
|
|
|
#' Conditional Variance Estimation (CVE) is a novel sufficient dimension
|
|
|
|
#' reduction (SDR) method for regressions satisfying \eqn{E(Y|X) = E(Y|B'X)},
|
|
|
|
#' where \eqn{B'X} is a lower dimensional projection of the predictors. CVE,
|
|
|
|
#' similarly to its main competitor, the minimum average variance estimation
|
|
|
|
#' (MAVE), is not based on inverse regression, and does not require the
|
|
|
|
#' restrictive linearity and constant variance conditions of moment based SDR
|
|
|
|
#' methods. CVE is data-driven and applies to additive error regressions with
|
|
|
|
#' continuous predictors and link function. The effectiveness and accuracy of
|
|
|
|
#' CVE compared to MAVE and other SDR techniques is demonstrated in simulation
|
|
|
|
#' studies. CVE is shown to outperform MAVE in some model set-ups, while it
|
|
|
|
#' remains largely on par under most others.
|
|
|
|
#' Let \eqn{Y} be a real univariate response and \eqn{X} a real
|
|
|
|
#' \eqn{p}-dimensional covariate vector. We assume that the dependence of
|
|
|
|
#' \eqn{Y} and \eqn{X} is modelled by
|
|
|
|
#' \deqn{Y = g(B'X) + \epsilon}
|
|
|
|
#' where \eqn{X} is independent of \eqn{\epsilon} with positive definite
|
|
|
|
#' variance-covariance matrix \eqn{Var(X) = \Sigma_X}. \eqn{\epsilon} is a mean
|
|
|
|
#' zero random variable with finite \eqn{Var(\epsilon) = E(\epsilon^2)}, \eqn{g}
|
|
|
|
#' is an unknown, continuous non-constant function,
|
|
|
|
#' and \eqn{B = (b_1, ..., b_k)} is
|
|
|
|
#' a real \eqn{p \times k}{p x k} matrix of rank \eqn{k \leq p}{k <= p}.
|
|
|
|
#' Without loss of generality \eqn{B} is assumed to be orthonormal.
|
2019-09-16 09:15:51 +00:00
|
|
|
#'
|
2019-10-18 07:06:36 +00:00
|
|
|
#' @author Daniel Kapla, Lukas Fertl, Bura Efstathia
|
2019-11-25 19:49:43 +00:00
|
|
|
#' @references Fertl Lukas, Bura Efstathia. (2019), Conditional Variance
|
|
|
|
#' Estimation for Sufficient Dimension Reduction. Working Paper.
|
2019-09-16 09:15:51 +00:00
|
|
|
#'
|
|
|
|
#' @docType package
|
|
|
|
#' @useDynLib CVE, .registration = TRUE
|
|
|
|
"_PACKAGE"
|
|
|
|
|
2019-10-18 07:06:36 +00:00
|
|
|
#' Conditional Variance Estimator (CVE).
|
2019-09-16 09:15:51 +00:00
|
|
|
#'
|
2019-11-25 19:49:43 +00:00
|
|
|
#' @inherit CVE-package description
|
2019-09-16 09:15:51 +00:00
|
|
|
#'
|
2019-10-18 07:06:36 +00:00
|
|
|
#' @param formula an object of class \code{"formula"} which is a symbolic
|
|
|
|
#' description of the model to be fitted.
|
|
|
|
#' @param data an optional data frame, containing the data for the formula if
|
|
|
|
#' supplied.
|
|
|
|
#' @param method specifies the CVE method variation as one of
|
2019-09-16 09:15:51 +00:00
|
|
|
#' \itemize{
|
2019-10-22 08:33:41 +00:00
|
|
|
#' \item "simple" exact implementation as described in the paper listed
|
2019-10-18 07:06:36 +00:00
|
|
|
#' below.
|
|
|
|
#'      \item "weighted" variation with adaptive weighting of slices.
|
2019-09-16 09:15:51 +00:00
|
|
|
#' }
|
2019-10-18 07:06:36 +00:00
|
|
|
#' @param ... Parameters passed on to \code{cve.call}.
|
2019-11-20 18:03:21 +00:00
|
|
|
#'
|
2019-11-25 19:49:43 +00:00
|
|
|
#' @return an S3 object of class \code{cve} with components:
|
|
|
|
#' \describe{
|
|
|
|
#' \item{X}{Original training data,}
|
|
|
|
#'      \item{Y}{Response of original training data,}
|
|
|
|
#' \item{method}{Name of used method,}
|
|
|
|
#' \item{call}{the matched call,}
|
|
|
|
#' \item{res}{list of components \code{V, L, B, loss, h} and \code{k} for
|
|
|
|
#' each \eqn{k=min.dim,...,max.dim} (dimension).}
|
2019-11-20 18:03:21 +00:00
|
|
|
#' }
|
|
|
|
#'
|
2019-09-16 09:15:51 +00:00
|
|
|
#' @examples
|
2019-10-18 07:06:36 +00:00
|
|
|
#' # create dataset
|
2019-11-22 08:32:14 +00:00
|
|
|
#' x <- matrix(rnorm(400), 100, 4)
|
|
|
|
#' y <- x[, 1] + x[, 2] + as.matrix(rnorm(100))
|
2019-09-16 09:15:51 +00:00
|
|
|
#'
|
2019-11-25 19:49:43 +00:00
|
|
|
#' # Call CVE.
|
|
|
|
#' dr <- cve(y ~ x)
|
2019-11-22 08:32:14 +00:00
|
|
|
#' # Call weighted CVE.
|
|
|
|
#' dr.weighted <- cve(y ~ x, method = "weighted")
|
2019-09-16 09:15:51 +00:00
|
|
|
#'
|
2019-11-25 19:49:43 +00:00
|
|
|
#' # Training data responses of reduced data.
|
|
|
|
#' y.est <- directions(dr, 1)
|
|
|
|
#' # Extract SDR subspace basis of dimension 1.
|
|
|
|
#' B <- coef(dr, 1)
|
|
|
|
#'
|
2019-10-22 08:33:41 +00:00
|
|
|
#' @seealso For a detailed description of \code{formula} see
|
2019-11-22 08:32:14 +00:00
|
|
|
#' \code{\link{formula}}.
|
2019-11-25 19:49:43 +00:00
|
|
|
#' @references Fertl Lukas, Bura Efstathia. (2019), Conditional Variance
|
|
|
|
#' Estimation for Sufficient Dimension Reduction. Working Paper.
|
|
|
|
#'
|
|
|
|
#' @importFrom stats model.frame
|
2019-09-16 09:15:51 +00:00
|
|
|
#' @export
|
2019-09-25 11:53:45 +00:00
|
|
|
cve <- function(formula, data, method = "simple", max.dim = 10L, ...) {
    # Formula front-end: resolves `formula` (optionally against `data`) into a
    # predictor matrix and a response vector and delegates the estimation to
    # `cve.call()`.

    # A supplied `data` has to be a data frame; anything else is rejected.
    if (!missing(data) && !is.data.frame(data)) {
        stop("Parameter 'data' must be a 'data.frame' or missing.")
    }
    # Without `data` the variables are looked up where the formula was created.
    if (missing(data)) {
        data <- environment(formula)
    }

    # First column of the model frame is the response, the rest the predictors.
    frame <- stats::model.frame(formula, data)
    response <- as.double(frame[, 1L])
    predictors <- as.matrix(frame[, -1L, drop = FALSE])

    fit <- cve.call(predictors, response,
                    method = method, max.dim = max.dim, ...)

    # Report the user's call to `cve()` instead of the internal `cve.call()`.
    fit$call <- match.call()
    fit
}
|
|
|
|
|
2019-11-25 19:49:43 +00:00
|
|
|
#' @inherit cve title
|
|
|
|
#' @inherit cve description
|
|
|
|
#'
|
2019-10-18 07:06:36 +00:00
|
|
|
#' @param nObs parameter for choosing bandwidth \code{h} using
|
|
|
|
#' \code{\link{estimate.bandwidth}} (ignored if \code{h} is supplied).
|
|
|
|
#' @param X data matrix with samples in its rows.
|
2019-10-22 08:33:41 +00:00
|
|
|
#' @param Y Responses (1 dimensional).
|
2019-11-20 18:03:21 +00:00
|
|
|
#' @param k Dimension of lower dimensional projection, if \code{k} is given
|
|
|
|
#' only the specified dimension \code{B} matrix is estimated.
|
2019-10-18 07:06:36 +00:00
|
|
|
#' @param min.dim lower bounds for \code{k}, (ignored if \code{k} is supplied).
|
|
|
|
#' @param max.dim upper bounds for \code{k}, (ignored if \code{k} is supplied).
|
|
|
|
#' @param tau Initial step-size.
|
|
|
|
#' @param tol Tolerance for break condition.
|
2019-11-22 08:32:14 +00:00
|
|
|
#' @param max.iter maximum number of optimization steps.
|
2019-10-18 07:06:36 +00:00
|
|
|
#' @param attempts number of arbitrary different starting points.
|
2019-11-20 18:03:21 +00:00
|
|
|
#' @param logger a logger function (only for advanced user, significantly slows
|
|
|
|
#' down the computation).
|
2019-11-22 08:32:14 +00:00
|
|
|
#' @param h bandwidth or function to estimate bandwidth, defaults to internally
|
|
|
|
#' estimated bandwidth.
|
|
|
|
#' @param momentum number in [0, 1) giving the ratio of momentum for the Euclidean
|
|
|
|
#' gradient update with a momentum term.
|
|
|
|
#' @param slack Positive scaling to allow small increases of the loss while
|
|
|
|
#' optimizing.
|
|
|
|
#' @param gamma step-size reduction multiple.
|
|
|
|
#' @param V.init Semi-orthogonal matrix of dimensions `(ncol(X), ncol(X) - k)`
#'  as optimization starting value. (If supplied, \code{attempts} is
|
|
|
|
#'  set to 0 and \code{k} to match dimension)
|
2019-11-20 18:03:21 +00:00
|
|
|
#'
|
2019-11-25 19:49:43 +00:00
|
|
|
#' @inherit cve return
|
|
|
|
#'
|
|
|
|
#' @examples
|
|
|
|
#' # Create a dataset (n samples):
|
|
|
|
#' n <- 100
|
|
|
|
#' X <- matrix(rnorm(4 * n), n)
|
|
|
|
#' Y <- matrix(X[, 1] + cos(X[, 2]) + rnorm(n, 0, .1), n)
|
|
|
|
#'
|
|
|
|
#' # Create logger function:
|
|
|
|
#' logger <- function(attempt, iter, data) {
|
|
|
|
#' if (iter == 0) {
|
|
|
|
#' cat("Starting attempt nr:", attempt, "\n")
|
|
|
|
#' }
|
|
|
|
#' cat(" iter:", iter, "loss:", data$loss, "\n")
|
2019-11-20 18:03:21 +00:00
|
|
|
#' }
|
|
|
|
#'
|
2019-11-25 19:49:43 +00:00
|
|
|
#' # Call 'cve' with logger:
|
|
|
|
#' cve(Y ~ X, logger = logger)
|
|
|
|
#'
|
2019-09-16 09:15:51 +00:00
|
|
|
#' @export
|
2019-09-25 11:53:45 +00:00
|
|
|
cve.call <- function(X, Y, method = "simple",
                     nObs = sqrt(nrow(X)), h = NULL,
                     min.dim = 1L, max.dim = 10L, k = NULL,
                     momentum = 0.0, tau = 1.0, tol = 1e-3,
                     slack = 0.0, gamma = 0.5,
                     V.init = NULL,
                     max.iter = 50L, attempts = 10L,
                     logger = NULL) {
    # Matrix interface of CVE: validates and normalizes all arguments, then
    # runs the native 'cve' routine once for every dimension `k` in
    # `min.dim:max.dim` and collects the per-dimension results in `res`.

    # Determine method with partial matching (shortcuts: "Weight" -> "weighted")
    methods <- list(
        "simple" = 0L,
        "weighted" = 1L
    )
    method_nr <- methods[[tolower(method), exact = FALSE]]
    if (!is.integer(method_nr)) {
        stop("Unable to determine method.")
    }
    # Replace a (possibly abbreviated) user input by the canonical name.
    method <- names(which(method_nr == methods))

    # --- parameter checking -------------------------------------------------
    if (!is.numeric(momentum) || length(momentum) != 1L) {
        stop("Momentum must be a number.")
    }
    if (!is.double(momentum)) {
        momentum <- as.double(momentum)
    }
    if (momentum < 0.0 || momentum >= 1.0) {
        stop("Momentum must be in [0, 1).")
    }

    if (!(is.matrix(X) && is.numeric(X))) {
        stop("Parameter 'X' should be a numeric matrices.")
    }
    # All other numeric arguments are coerced to double before the native call;
    # treat an integer `X` the same way (assumes the native routine works on
    # doubles -- TODO confirm against the C sources).
    if (!is.double(X)) {
        storage.mode(X) <- "double"
    }
    if (!is.numeric(Y)) {
        stop("Parameter 'Y' must be numeric.")
    }
    if (is.matrix(Y) || !is.double(Y)) {
        Y <- as.double(Y)
    }
    if (nrow(X) != length(Y)) {
        stop("Rows of 'X' and 'Y' elements are not compatible.")
    }
    if (ncol(X) < 2) {
        stop("'X' is one dimensional, no need for dimension reduction.")
    }

    if (!is.null(V.init)) {
        if (!is.matrix(V.init)) {
            stop("'V.init' must be a matrix.")
        }
        # `all.equal()` returns a character vector (not FALSE) on mismatch, so
        # it has to be wrapped in `isTRUE()`; dimnames of `V.init` are ignored.
        is_semi_orthogonal <- isTRUE(all.equal(
            crossprod(V.init), diag(1, ncol(V.init)),
            check.attributes = FALSE
        ))
        if (!is_semi_orthogonal) {
            stop("'V.init' must be semi-orthogonal.")
        }
        if (ncol(X) != nrow(V.init) || ncol(X) <= ncol(V.init)) {
            stop("Dimension missmatch of 'V.init' and 'X'")
        }
        if (is.integer(V.init)) {
            storage.mode(V.init) <- "double"
        }
        # The start value fixes the dimension; `attempts` is set to 0 here
        # (presumably the native code's marker for "use V.init" -- verify).
        min.dim <- max.dim <- ncol(X) - ncol(V.init)
        attempts <- 0L
    } else if (is.null(k)) {
        min.dim <- as.integer(min.dim)
        max.dim <- as.integer(min(max.dim, ncol(X) - 1L))
    } else {
        min.dim <- max.dim <- as.integer(k)
    }
    if (min.dim > max.dim) {
        stop("'min.dim' bigger 'max.dim'.")
    }
    if (max.dim >= ncol(X)) {
        stop("'max.dim' (or 'k') must be smaller than 'ncol(X)'.")
    }

    # Bandwidth: estimated per dimension (default or user supplied estimator)
    # or a fixed positive number.
    if (is.null(h)) {
        estimate <- TRUE
    } else if (is.function(h)) {
        estimate <- TRUE
        # Locally shadow the default estimator with the user supplied one.
        estimate.bandwidth <- h
    } else if (is.numeric(h) && length(h) == 1L && h > 0.0) {
        estimate <- FALSE
        h <- as.double(h)
    } else {
        stop("Bandwidth 'h' must be positive numeric.")
    }

    if (!is.numeric(tau) || length(tau) != 1L || tau <= 0.0) {
        stop("Initial step-width 'tau' must be positive number.")
    } else {
        tau <- as.double(tau)
    }
    if (!is.numeric(tol) || length(tol) != 1L || tol < 0.0) {
        stop("Break condition tolerance 'tol' must be not negative number.")
    } else {
        tol <- as.double(tol)
    }
    if (!is.numeric(slack) || length(slack) != 1L || slack < 0.0) {
        stop("Break condition slack 'slack' must be not negative number.")
    } else {
        slack <- as.double(slack)
    }
    if (!is.numeric(gamma) || length(gamma) != 1L ||
            gamma <= 0.0 || gamma >= 1.0) {
        stop("Stepsize reduction 'gamma' must be between 0 and 1.")
    } else {
        gamma <- as.double(gamma)
    }

    if (!is.numeric(max.iter) || length(max.iter) != 1L) {
        stop("Parameter 'max.iter' must be positive integer.")
    } else if (!is.integer(max.iter)) {
        max.iter <- as.integer(max.iter)
    }
    if (max.iter < 1L) {
        stop("Parameter 'max.iter' must be at least 1L.")
    }

    # `attempts` is only validated when no start value was supplied (it has
    # been overwritten above otherwise).
    if (is.null(V.init)) {
        if (!is.numeric(attempts) || length(attempts) != 1L) {
            stop("Parameter 'attempts' must be positive integer.")
        } else if (!is.integer(attempts)) {
            attempts <- as.integer(attempts)
        }
        if (attempts < 1L) {
            stop("Parameter 'attempts' must be at least 1L.")
        }
    }

    # The logger's enclosing environment is handed to the native routine
    # together with the logger itself.
    if (is.function(logger)) {
        loggerEnv <- environment(logger)
    } else {
        loggerEnv <- NULL
    }

    # --- estimation ---------------------------------------------------------
    matched_call <- match.call()
    dr <- list()
    dr$res <- list()
    for (k in min.dim:max.dim) {
        if (estimate) {
            h <- estimate.bandwidth(X, k, nObs)
        }

        dr.k <- .Call('cve', PACKAGE = 'CVE',
                      X, Y, k, h,
                      method_nr,
                      V.init,
                      momentum, tau, tol,
                      slack, gamma,
                      max.iter, attempts,
                      logger, loggerEnv)

        # Derive `B` from the returned `V` via `null()` and summarize the
        # returned `L` by its mean as the overall loss.
        dr.k$B <- null(dr.k$V)
        dr.k$loss <- mean(dr.k$L)
        dr.k$h <- h
        dr.k$k <- k
        class(dr.k) <- "cve.k"
        dr$res[[as.character(k)]] <- dr.k
    }

    # augment result information
    dr$X <- X
    dr$Y <- Y
    dr$method <- method
    dr$call <- matched_call
    class(dr) <- "cve"
    return(dr)
}
|