#' Conditional Variance Estimator (CVE) Package. #' #' Conditional Variance Estimation (CVE) is a novel sufficient dimension #' reduction (SDR) method for regressions satisfying \eqn{E(Y|X) = E(Y|B'X)}, #' where \eqn{B'X} is a lower dimensional projection of the predictors. CVE, #' similarly to its main competitor, the mean average variance estimation #' (MAVE), is not based on inverse regression, and does not require the #' restrictive linearity and constant variance conditions of moment based SDR #' methods. CVE is data-driven and applies to additive error regressions with #' continuous predictors and link function. The effectiveness and accuracy of #' CVE compared to MAVE and other SDR techniques is demonstrated in simulation #' studies. CVE is shown to outperform MAVE in some model set-ups, while it #' remains largely on par under most others. #' #' Let \eqn{Y} be real denotes a univariate response and \eqn{X} a real #' \eqn{p}-dimensional covariate vector. We assume that the dependence of #' \eqn{Y} and \eqn{X} is modelled by #' \deqn{Y = g(B'X) + \epsilon} #' where \eqn{X} is independent of \eqn{\epsilon} with positive definite #' variance-covariance matrix \eqn{Var(X) = \Sigma_X}. \eqn{\epsilon} is a mean #' zero random variable with finite \eqn{Var(\epsilon) = E(\epsilon^2)}, \eqn{g} #' is an unknown, continuous non-constant function, #' and \eqn{B = (b_1, ..., b_k)} is #' a real \eqn{p \times k}{p x k} of rank \eqn{k <= p}{k \leq p}. #' Without loss of generality \eqn{B} is assumed to be orthonormal. #' #' @author Daniel Kapla, Lukas Fertl, Bura Efstathia #' @references Fertl Lukas, Bura Efstathia. Conditional Variance Estimation for #' Sufficient Dimension Reduction, 2019 #' #' @importFrom stats model.frame #' @docType package #' @useDynLib CVE, .registration = TRUE "_PACKAGE" #' Conditional Variance Estimator (CVE). #' #' TODO: reuse of package description and details!!!! #' #' @param formula an object of class \code{"formula"} which is a symbolic #' description of the model to be fitted. #' @param data an optional data frame, containing the data for the formula if #' supplied. #' @param method specifies the CVE method variation as one of #' \itemize{ #' \item "simple" exact implementation as describet in the paper listed #' below. #' \item "weighted" variation with addaptive weighting of slices. #' } #' @param ... Parameters passed on to \code{cve.call}. #' @examples #' library(CVE) #' #' # create dataset #' n <- 200 #' p <- 12 #' X <- matrix(rnorm(n * p), n, p) #' B <- cbind(c(1, rep(0, p - 1)), c(0, 1, rep(0, p - 2))) #' Y <- X %*% B #' Y <- Y[, 1L]^2 + Y[, 2L]^2 + rnorm(n, 0, 0.3) #' #' # Call the CVE method. #' dr <- cve(Y ~ X) #' round(dr[[2]]$B, 1) #' #' @seealso For a detailed description of the formula parameter see #' [\code{\link{formula}}]. #' @export cve <- function(formula, data, method = "simple", max.dim = 10L, ...) { # check for type of `data` if supplied and set default if (missing(data)) { data <- environment(formula) } else if (!is.data.frame(data)) { stop("Parameter 'data' must be a 'data.frame' or missing.") } # extract `X`, `Y` from `formula` with `data` model <- stats::model.frame(formula, data) X <- as.matrix(model[ ,-1L, drop = FALSE]) Y <- as.double(model[ , 1L]) # pass extracted data on to [cve.call()] dr <- cve.call(X, Y, method = method, max.dim = max.dim, ...) # overwrite `call` property from [cve.call()] dr$call <- match.call() return(dr) } #' @param nObs parameter for choosing bandwidth \code{h} using #' \code{\link{estimate.bandwidth}} (ignored if \code{h} is supplied). #' @param X data matrix with samples in its rows. #' @param Y Responces (1 dimensional). #' @param k Dimension of lower dimensional projection, if given only the #' specified dimension is estimated. #' @param min.dim lower bounds for \code{k}, (ignored if \code{k} is supplied). #' @param max.dim upper bounds for \code{k}, (ignored if \code{k} is supplied). #' @param tau Initial step-size. #' @param tol Tolerance for break condition. #' @param epochs maximum number of optimization steps. #' @param attempts number of arbitrary different starting points. #' @param logger a logger function (only for addvanced user). #' @rdname cve #' @export cve.call <- function(X, Y, method = "simple", nObs = sqrt(nrow(X)), h = NULL, min.dim = 1L, max.dim = 10L, k = NULL, tau = 1.0, tol = 1e-3, epochs = 50L, attempts = 10L, logger = NULL) { # parameter checking if (!(is.matrix(X) && is.numeric(X))) { stop("Parameter 'X' should be a numeric matrices.") } if (!is.numeric(Y)) { stop("Parameter 'Y' must be numeric.") } if (is.matrix(Y) || !is.double(Y)) { Y <- as.double(Y) } if (nrow(X) != length(Y)) { stop("Rows of 'X' and 'Y' elements are not compatible.") } if (ncol(X) < 2) { stop("'X' is one dimensional, no need for dimension reduction.") } if (missing(k) || is.null(k)) { min.dim <- as.integer(min.dim) max.dim <- as.integer(min(max.dim, ncol(X) - 1L)) } else { min.dim <- as.integer(k) max.dim <- as.integer(k) } if (min.dim > max.dim) { stop("'min.dim' bigger 'max.dim'.") } if (max.dim >= ncol(X)) { stop("'max.dim' (or 'k') must be smaller than 'ncol(X)'.") } if (missing(h) || is.null(h)) { estimate <- TRUE } else if (is.function(h)) { estimate <- TRUE estimate.bandwidth <- h } else if (is.numeric(h) && h > 0.0) { estimate <- FALSE h <- as.double(h) } else { stop("Bandwidth 'h' must be positive numeric.") } if (!is.numeric(tau) || length(tau) > 1L || tau <= 0.0) { stop("Initial step-width 'tau' must be positive number.") } else { tau <- as.double(tau) } if (!is.numeric(tol) || length(tol) > 1L || tol < 0.0) { stop("Break condition tolerance 'tol' must be not negative number.") } else { tol <- as.double(tol) } if (!is.numeric(epochs) || length(epochs) > 1L) { stop("Parameter 'epochs' must be positive integer.") } else if (!is.integer(epochs)) { epochs <- as.integer(epochs) } if (epochs < 1L) { stop("Parameter 'epochs' must be at least 1L.") } if (!is.numeric(attempts) || length(attempts) > 1L) { stop("Parameter 'attempts' must be positive integer.") } else if (!is.integer(attempts)) { attempts <- as.integer(attempts) } if (attempts < 1L) { stop("Parameter 'attempts' must be at least 1L.") } if (is.function(logger)) { loggerEnv <- environment(logger) } else { loggerEnv <- NULL } # Call specified method. method <- tolower(method) call <- match.call() dr <- list() for (k in min.dim:max.dim) { if (estimate) { h <- estimate.bandwidth(X, k, nObs) } if (method == 'simple') { dr.k <- .Call('cve_simple', PACKAGE = 'CVE', X, Y, k, h, tau, tol, epochs, attempts, logger, loggerEnv) # dr.k <- cve_simple(X, Y, k, nObs = nObs, ...) # } else if (method == 'linesearch') { # dr.k <- cve_linesearch(X, Y, k, nObs = nObs, ...) # } else if (method == 'rcg') { # dr.k <- cve_rcg(X, Y, k, nObs = nObs, ...) # } else if (method == 'momentum') { # dr.k <- cve_momentum(X, Y, k, nObs = nObs, ...) # } else if (method == 'rmsprob') { # dr.k <- cve_rmsprob(X, Y, k, nObs = nObs, ...) # } else if (method == 'sgdrmsprob') { # dr.k <- cve_sgdrmsprob(X, Y, k, nObs = nObs, ...) # } else if (method == 'sgd') { # dr.k <- cve_sgd(X, Y, k, nObs = nObs, ...) } else { stop('Got unknown method.') } dr.k$B <- null(dr.k$V) dr.k$loss <- mean(dr.k$L) dr.k$h <- h dr.k$k <- k class(dr.k) <- "cve.k" dr[[k]] <- dr.k } # augment result information dr$X <- X dr$Y <- Y dr$method <- method dr$call <- call class(dr) <- "cve" return(dr) } #' Loss distribution kink plot. #' #' @param x Object of class \code{"cve"} (result of [\code{\link{cve}}]). #' @param ... Pass through parameters to [\code{\link{plot}}] and #' [\code{\link{lines}}] #' #' @seealso see \code{\link{par}} for graphical parameters to pass through #' as well as \code{\link{plot}}, the standard plot utility. #' @importFrom graphics plot lines points #' @method plot cve #' @export plot.cve <- function(x, ...) { L <- c() k <- c() for (dr.k in x) { if (class(dr.k) == 'cve.k') { k <- c(k, paste0(dr.k$k)) L <- c(L, dr.k$L) } } L <- matrix(L, ncol = length(k)) / var(x$Y) boxplot(L, main = "Kink plot", xlab = "SDR dimension", ylab = "Sample loss distribution", names = k) # lines(apply(L, 2, mean)) # TODO: ? } #' Prints a summary of a \code{cve} result. #' @param object Instance of 'cve' as return of \code{cve}. #' @method summary cve #' @export summary.cve <- function(object, ...) { cat('Summary of CVE result - Method: "', object$method, '"\n', '\n', 'Dataset size: ', nrow(object$X), '\n', 'Data Dimension: ', ncol(object$X), '\n', 'SDR Dimension: ', object$k, '\n', 'loss: ', object$loss, '\n', '\n', 'Called via:\n', ' ', sep='') print(object$call) }