#' Conditional Variance Estimator (CVE) Package. #' #' Conditional Variance Estimation (CVE) is a novel sufficient dimension #' reduction (SDR) method for regressions satisfying \eqn{E(Y|X) = E(Y|B'X)}, #' where \eqn{B'X} is a lower dimensional projection of the predictors. CVE, #' similarly to its main competitor, the mean average variance estimation #' (MAVE), is not based on inverse regression, and does not require the #' restrictive linearity and constant variance conditions of moment based SDR #' methods. CVE is data-driven and applies to additive error regressions with #' continuous predictors and link function. The effectiveness and accuracy of #' CVE compared to MAVE and other SDR techniques is demonstrated in simulation #' studies. CVE is shown to outperform MAVE in some model set-ups, while it #' remains largely on par under most others. #' Let \eqn{Y} be real denotes a univariate response and \eqn{X} a real #' \eqn{p}-dimensional covariate vector. We assume that the dependence of #' \eqn{Y} and \eqn{X} is modelled by #' \deqn{Y = g(B'X) + \epsilon} #' where \eqn{X} is independent of \eqn{\epsilon} with positive definite #' variance-covariance matrix \eqn{Var(X) = \Sigma_X}. \eqn{\epsilon} is a mean #' zero random variable with finite \eqn{Var(\epsilon) = E(\epsilon^2)}, \eqn{g} #' is an unknown, continuous non-constant function, #' and \eqn{B = (b_1, ..., b_k)} is #' a real \eqn{p \times k}{p x k} of rank \eqn{k <= p}{k \leq p}. #' Without loss of generality \eqn{B} is assumed to be orthonormal. #' #' @author Daniel Kapla, Lukas Fertl, Bura Efstathia #' @references Fertl Lukas, Bura Efstathia. (2019), Conditional Variance #' Estimation for Sufficient Dimension Reduction. Working Paper. #' #' @docType package #' @useDynLib CVE, .registration = TRUE "_PACKAGE" #' Conditional Variance Estimator (CVE). #' #' @inherit CVE-package description #' #' @param formula an object of class \code{"formula"} which is a symbolic #' description of the model to be fitted. #' @param data an optional data frame, containing the data for the formula if #' supplied. #' @param method specifies the CVE method variation as one of #' \itemize{ #' \item "simple" exact implementation as described in the paper listed #' below. #' \item "weighted" variation with addaptive weighting of slices. #' } #' @param ... Parameters passed on to \code{cve.call}. #' #' @return an S3 object of class \code{cve} with components: #' \describe{ #' \item{X}{Original training data,} #' \item{Y}{Responce of original training data,} #' \item{method}{Name of used method,} #' \item{call}{the matched call,} #' \item{res}{list of components \code{V, L, B, loss, h} and \code{k} for #' each \eqn{k=min.dim,...,max.dim} (dimension).} #' } #' #' @examples #' # create dataset #' x <- matrix(rnorm(400), 100, 4) #' y <- x[, 1] + x[, 2] + as.matrix(rnorm(100)) #' #' # Call CVE. #' dr <- cve(y ~ x) #' # Call weighted CVE. #' dr.weighted <- cve(y ~ x, method = "weighted") #' #' # Training data responces of reduced data. #' y.est <- directions(dr, 1) #' # Extract SDR subspace basis of dimension 1. #' B <- coef(dr.momentum, 1) #' #' @seealso For a detailed description of \code{formula} see #' \code{\link{formula}}. #' @references Fertl Lukas, Bura Efstathia. (2019), Conditional Variance #' Estimation for Sufficient Dimension Reduction. Working Paper. #' #' @importFrom stats model.frame #' @export cve <- function(formula, data, method = "simple", max.dim = 10L, ...) { # check for type of `data` if supplied and set default if (missing(data)) { data <- environment(formula) } else if (!is.data.frame(data)) { stop("Parameter 'data' must be a 'data.frame' or missing.") } # extract `X`, `Y` from `formula` with `data` model <- stats::model.frame(formula, data) X <- as.matrix(model[ ,-1L, drop = FALSE]) Y <- as.double(model[ , 1L]) # pass extracted data on to [cve.call()] dr <- cve.call(X, Y, method = method, max.dim = max.dim, ...) # overwrite `call` property from [cve.call()] dr$call <- match.call() return(dr) } #' @inherit cve title #' @inherit cve description #' #' @param nObs parameter for choosing bandwidth \code{h} using #' \code{\link{estimate.bandwidth}} (ignored if \code{h} is supplied). #' @param X data matrix with samples in its rows. #' @param Y Responses (1 dimensional). #' @param k Dimension of lower dimensional projection, if \code{k} is given #' only the specified dimension \code{B} matrix is estimated. #' @param min.dim lower bounds for \code{k}, (ignored if \code{k} is supplied). #' @param max.dim upper bounds for \code{k}, (ignored if \code{k} is supplied). #' @param tau Initial step-size. #' @param tol Tolerance for break condition. #' @param max.iter maximum number of optimization steps. #' @param attempts number of arbitrary different starting points. #' @param logger a logger function (only for advanced user, significantly slows #' down the computation). #' @param h bandwidth or function to estimate bandwidth, defaults to internaly #' estimated bandwidth. #' @param momentum number of [0, 1) giving the ration of momentum for eucledian #' gradient update with a momentum term. #' @param slack Positive scaling to allow small increases of the loss while #' optimizing. #' @param gamma step-size reduction multiple. #' @param V.init Semi-orthogonal matrix of dimensions `(ncol(X), ncol(X) - k)` #' as optimization starting value. (If supplied, \code{attempts} is #' set to 1 and \code{k} to match dimension) #' #' @inherit cve return #' #' @examples #' # Create a dataset (n samples): #' n <- 100 #' X <- matrix(rnorm(4 * n), n) #' Y <- matrix(X[, 1] + cos(X[, 2]) + rnorm(n, 0, .1), n) #' #' # Create logger function: #' logger <- function(attempt, iter, data) { #' if (iter == 0) { #' cat("Starting attempt nr:", attempt, "\n") #' } #' cat(" iter:", iter, "loss:", data$loss, "\n") #' } #' #' Call 'cve' with logger: #' cve(Y ~ X, logger = logger) #' #' @export cve.call <- function(X, Y, method = "simple", nObs = sqrt(nrow(X)), h = NULL, min.dim = 1L, max.dim = 10L, k = NULL, momentum = 0.0, tau = 1.0, tol = 1e-3, slack = 0.0, gamma = 0.5, V.init = NULL, max.iter = 50L, attempts = 10L, logger = NULL) { # Determine method with partial matching (shortcuts: "Weight" -> "weighted") methods <- list( "simple" = 0L, "weighted" = 1L ) method_nr <- methods[[tolower(method), exact = FALSE]] if (!is.integer(method_nr)) { stop('Unable to determine method.') } method <- names(which(method_nr == methods)) # parameter checking if (!is.numeric(momentum) || length(momentum) > 1L) { stop("Momentum must be a number.") } if (!is.double(momentum)) { momentum <- as.double(momentum) } if (momentum < 0.0 || momentum >= 1.0) { stop("Momentum must be in [0, 1).") } if (!(is.matrix(X) && is.numeric(X))) { stop("Parameter 'X' should be a numeric matrices.") } if (!is.numeric(Y)) { stop("Parameter 'Y' must be numeric.") } if (is.matrix(Y) || !is.double(Y)) { Y <- as.double(Y) } if (nrow(X) != length(Y)) { stop("Rows of 'X' and 'Y' elements are not compatible.") } if (ncol(X) < 2) { stop("'X' is one dimensional, no need for dimension reduction.") } if (!is.null(V.init)) { if (!is.matrix(V.init)) { stop("'V.init' must be a matrix.") } if (!all.equal(crossprod(V.init), diag(1, ncol(V.init)))) { stop("'V.init' must be semi-orthogonal.") } if (ncol(X) != nrow(V.init) || ncol(X) <= ncol(V.init)) { stop("Dimension missmatch of 'V.init' and 'X'") } min.dim <- max.dim <- ncol(X) - ncol(V.init) attempts <- 0L } else if (missing(k) || is.null(k)) { min.dim <- as.integer(min.dim) max.dim <- as.integer(min(max.dim, ncol(X) - 1L)) } else { min.dim <- max.dim <- as.integer(k) } if (min.dim > max.dim) { stop("'min.dim' bigger 'max.dim'.") } if (max.dim >= ncol(X)) { stop("'max.dim' (or 'k') must be smaller than 'ncol(X)'.") } if (missing(h) || is.null(h)) { estimate <- TRUE } else if (is.function(h)) { estimate <- TRUE estimate.bandwidth <- h } else if (is.numeric(h) && h > 0.0) { estimate <- FALSE h <- as.double(h) } else { stop("Bandwidth 'h' must be positive numeric.") } if (!is.numeric(tau) || length(tau) > 1L || tau <= 0.0) { stop("Initial step-width 'tau' must be positive number.") } else { tau <- as.double(tau) } if (!is.numeric(tol) || length(tol) > 1L || tol < 0.0) { stop("Break condition tolerance 'tol' must be not negative number.") } else { tol <- as.double(tol) } if (!is.numeric(slack) || length(slack) > 1L || slack < 0.0) { stop("Break condition slack 'slack' must be not negative number.") } else { slack <- as.double(slack) } if (!is.numeric(gamma) || length(gamma) > 1L || gamma <= 0.0 || gamma >= 1.0) { stop("Stepsize reduction 'gamma' must be between 0 and 1.") } else { gamma <- as.double(gamma) } if (!is.numeric(max.iter) || length(max.iter) > 1L) { stop("Parameter 'max.iter' must be positive integer.") } else if (!is.integer(max.iter)) { max.iter <- as.integer(max.iter) } if (max.iter < 1L) { stop("Parameter 'max.iter' must be at least 1L.") } if (is.null(V.init)) { if (!is.numeric(attempts) || length(attempts) > 1L) { stop("Parameter 'attempts' must be positive integer.") } else if (!is.integer(attempts)) { attempts <- as.integer(attempts) } if (attempts < 1L) { stop("Parameter 'attempts' must be at least 1L.") } } if (is.function(logger)) { loggerEnv <- environment(logger) } else { loggerEnv <- NULL } # Call specified method. method <- tolower(method) call <- match.call() dr <- list() dr$res <- list() for (k in min.dim:max.dim) { if (estimate) { h <- estimate.bandwidth(X, k, nObs) } dr.k <- .Call('cve', PACKAGE = 'CVE', X, Y, k, h, method_nr, V.init, momentum, tau, tol, slack, gamma, max.iter, attempts, logger, loggerEnv) dr.k$B <- null(dr.k$V) dr.k$loss <- mean(dr.k$L) dr.k$h <- h dr.k$k <- k class(dr.k) <- "cve.k" dr$res[[as.character(k)]] <- dr.k } # augment result information dr$X <- X dr$Y <- Y dr$method <- method dr$call <- call class(dr) <- "cve" return(dr) }