#' Conditional Variance Estimator (CVE) Package. #' #' Conditional Variance Estimation (CVE) is a novel sufficient dimension #' reduction (SDR) method for regressions satisfying \eqn{E(Y|X) = E(Y|B'X)}, #' where \eqn{B'X} is a lower dimensional projection of the predictors. CVE, #' similarly to its main competitor, the mean average variance estimation #' (MAVE), is not based on inverse regression, and does not require the #' restrictive linearity and constant variance conditions of moment based SDR #' methods. CVE is data-driven and applies to additive error regressions with #' continuous predictors and link function. The effectiveness and accuracy of #' CVE compared to MAVE and other SDR techniques is demonstrated in simulation #' studies. CVE is shown to outperform MAVE in some model set-ups, while it #' remains largely on par under most others. #' #' Let \eqn{Y} be real denotes a univariate response and \eqn{X} a real #' \eqn{p}-dimensional covariate vector. We assume that the dependence of #' \eqn{Y} and \eqn{X} is modelled by #' \deqn{Y = g(B'X) + \epsilon} #' where \eqn{X} is independent of \eqn{\epsilon} with positive definite #' variance-covariance matrix \eqn{Var(X) = \Sigma_X}. \eqn{\epsilon} is a mean #' zero random variable with finite \eqn{Var(\epsilon) = E(\epsilon^2)}, \eqn{g} #' is an unknown, continuous non-constant function, #' and \eqn{B = (b_1, ..., b_k)} is #' a real \eqn{p \times k}{p x k} of rank \eqn{k <= p}{k \leq p}. #' Without loss of generality \eqn{B} is assumed to be orthonormal. #' #' @author Daniel Kapla, Lukas Fertl, Bura Efstathia #' @references Fertl Lukas, Bura Efstathia. Conditional Variance Estimation for #' Sufficient Dimension Reduction, 2019 #' #' @importFrom stats model.frame #' @docType package #' @useDynLib CVE, .registration = TRUE "_PACKAGE" #' Conditional Variance Estimator (CVE). #' #' TODO: reuse of package description and details!!!! #' #' @param formula an object of class \code{"formula"} which is a symbolic #' description of the model to be fitted. #' @param data an optional data frame, containing the data for the formula if #' supplied. #' @param method specifies the CVE method variation as one of #' \itemize{ #' \item "simple" exact implementation as described in the paper listed #' below. #' \item "weighted" variation with addaptive weighting of slices. #' } #' @param ... Parameters passed on to \code{cve.call}. #' #' @return dr is a S3 object of class \code{cve} with named properties: #' \itemize{ #' \item X: Original training data, #' \item Y: Responce of original training data, #' \item method: Name of used method, #' \item call: The method call #' } #' as well as indexed entries \code{dr[[k]]} storing the k-dimensional SDR #' projection matrices. #' #' @examples #' library(CVE) #' #' # create dataset #' n <- 200 #' p <- 12 #' X <- matrix(rnorm(n * p), n, p) #' B <- cbind(c(1, rep(0, p - 1)), c(0, 1, rep(0, p - 2))) #' Y <- X %*% B #' Y <- Y[, 1L]^2 + Y[, 2L]^2 + rnorm(n, 0, 0.3) #' #' # Call the CVE method. #' dr <- cve(Y ~ X) #' (B <- basis(dr, 2)) #' #' @seealso For a detailed description of \code{formula} see #' [\code{\link{formula}}]. #' @export cve <- function(formula, data, method = "simple", max.dim = 10L, ...) { # check for type of `data` if supplied and set default if (missing(data)) { data <- environment(formula) } else if (!is.data.frame(data)) { stop("Parameter 'data' must be a 'data.frame' or missing.") } # extract `X`, `Y` from `formula` with `data` model <- stats::model.frame(formula, data) X <- as.matrix(model[ ,-1L, drop = FALSE]) Y <- as.double(model[ , 1L]) # pass extracted data on to [cve.call()] dr <- cve.call(X, Y, method = method, max.dim = max.dim, ...) # overwrite `call` property from [cve.call()] dr$call <- match.call() return(dr) } #' @param nObs parameter for choosing bandwidth \code{h} using #' \code{\link{estimate.bandwidth}} (ignored if \code{h} is supplied). #' @param X data matrix with samples in its rows. #' @param Y Responses (1 dimensional). #' @param k Dimension of lower dimensional projection, if \code{k} is given #' only the specified dimension \code{B} matrix is estimated. #' @param min.dim lower bounds for \code{k}, (ignored if \code{k} is supplied). #' @param max.dim upper bounds for \code{k}, (ignored if \code{k} is supplied). #' @param tau Initial step-size. #' @param tol Tolerance for break condition. #' @param epochs maximum number of optimization steps. #' @param attempts number of arbitrary different starting points. #' @param logger a logger function (only for advanced user, significantly slows #' down the computation). #' #' @return dr is a list which contains: #' \itemize{ #' \item dir: dir[[d]] is the central space with d-dimension #' d = 1, 2, ..., p reduced direction of different dimensions #' \item y: the value of response #' \item idx: the index of variables which survives after screening #' \item max.dim: the largest dimensions of CS or CMS which have been calculated in mave function #' \item ky: parameter used for DIM for selection #' \item x: the original training data #' } #' #' @rdname cve #' @export cve.call <- function(X, Y, method = "simple", nObs = sqrt(nrow(X)), h = NULL, min.dim = 1L, max.dim = 10L, k = NULL, momentum = 0.0, tau = 1.0, tol = 1e-3, slack = 0.0, gamma = 0.5, V.init = NULL, epochs = 50L, attempts = 10L, logger = NULL) { # get method bitmask methods <- list( "simple" = 0L, "weighted" = 1L ) method <- tolower(method) if (!(method %in% names(methods))) { stop('Got unknown method.') } method_bitmask <- methods[[method]] # parameter checking if (!is.numeric(momentum) || length(momentum) > 1L) { stop("Momentum must be a number.") } if (!is.double(momentum)) { momentum <- as.double(momentum) } if (momentum < 0.0 || momentum >= 1.0) { stop("Momentum must be in [0, 1).") } if (!(is.matrix(X) && is.numeric(X))) { stop("Parameter 'X' should be a numeric matrices.") } if (!is.numeric(Y)) { stop("Parameter 'Y' must be numeric.") } if (is.matrix(Y) || !is.double(Y)) { Y <- as.double(Y) } if (nrow(X) != length(Y)) { stop("Rows of 'X' and 'Y' elements are not compatible.") } if (ncol(X) < 2) { stop("'X' is one dimensional, no need for dimension reduction.") } if (!is.null(V.init)) { if (!is.matrix(V.init)) { stop("'V.init' must be a matrix.") } if (!all.equal(crossprod(V.init), diag(1, ncol(V.init)))) { stop("'V.init' must be semi-orthogonal.") } if (ncol(X) != nrow(V.init) || ncol(X) <= ncol(V.init)) { stop("Dimension missmatch of 'V.init' and 'X'") } min.dim <- max.dim <- ncol(X) - ncol(V.init) attempts <- 0L } else if (missing(k) || is.null(k)) { min.dim <- as.integer(min.dim) max.dim <- as.integer(min(max.dim, ncol(X) - 1L)) } else { min.dim <- max.dim <- as.integer(k) } if (min.dim > max.dim) { stop("'min.dim' bigger 'max.dim'.") } if (max.dim >= ncol(X)) { stop("'max.dim' (or 'k') must be smaller than 'ncol(X)'.") } if (missing(h) || is.null(h)) { estimate <- TRUE } else if (is.function(h)) { estimate <- TRUE estimate.bandwidth <- h } else if (is.numeric(h) && h > 0.0) { estimate <- FALSE h <- as.double(h) } else { stop("Bandwidth 'h' must be positive numeric.") } if (!is.numeric(tau) || length(tau) > 1L || tau <= 0.0) { stop("Initial step-width 'tau' must be positive number.") } else { tau <- as.double(tau) } if (!is.numeric(tol) || length(tol) > 1L || tol < 0.0) { stop("Break condition tolerance 'tol' must be not negative number.") } else { tol <- as.double(tol) } if (!is.numeric(slack) || length(slack) > 1L || slack < 0.0) { stop("Break condition slack 'slack' must be not negative number.") } else { slack <- as.double(slack) } if (!is.numeric(gamma) || length(gamma) > 1L || gamma <= 0.0 || gamma >= 1.0) { stop("Stepsize reduction 'gamma' must be between 0 and 1.") } else { gamma <- as.double(gamma) } if (!is.numeric(epochs) || length(epochs) > 1L) { stop("Parameter 'epochs' must be positive integer.") } else if (!is.integer(epochs)) { epochs <- as.integer(epochs) } if (epochs < 1L) { stop("Parameter 'epochs' must be at least 1L.") } if (is.null(V.init)) { if (!is.numeric(attempts) || length(attempts) > 1L) { stop("Parameter 'attempts' must be positive integer.") } else if (!is.integer(attempts)) { attempts <- as.integer(attempts) } if (attempts < 1L) { stop("Parameter 'attempts' must be at least 1L.") } } if (is.function(logger)) { loggerEnv <- environment(logger) } else { loggerEnv <- NULL } # Call specified method. method <- tolower(method) call <- match.call() dr <- list() dr$res <- list() for (k in min.dim:max.dim) { if (estimate) { h <- estimate.bandwidth(X, k, nObs) } dr.k <- .Call('cve', PACKAGE = 'CVE', X, Y, k, h, method_bitmask, V.init, momentum, tau, tol, slack, gamma, epochs, attempts, logger, loggerEnv) dr.k$B <- null(dr.k$V) dr.k$loss <- mean(dr.k$L) dr.k$h <- h dr.k$k <- k class(dr.k) <- "cve.k" dr$res[[as.character(k)]] <- dr.k } # augment result information dr$X <- X dr$Y <- Y dr$method <- method dr$call <- call class(dr) <- "cve" return(dr) } #' Loss distribution elbow plot. #' #' Boxplots of the loss from \code{min.dim} to \code{max.dim} \code{k} values. #' #' @param x Object of class \code{"cve"} (result of [\code{\link{cve}}]). #' @param ... Pass through parameters to [\code{\link{plot}}] and #' [\code{\link{lines}}] #' #' @seealso see \code{\link{par}} for graphical parameters to pass through #' as well as \code{\link{plot}}, the standard plot utility. #' @method plot cve #' @importFrom graphics plot lines points #' @export plot.cve <- function(x, ...) { L <- c() k <- c() for (dr.k in x$res) { if (class(dr.k) == 'cve.k') { k <- c(k, as.character(dr.k$k)) L <- c(L, dr.k$L) } } L <- matrix(L, ncol = length(k)) / var(x$Y) boxplot(L, main = "elbow plot", xlab = "SDR dimension", ylab = "Sample loss distribution", names = k) } #' Prints a summary of a \code{cve} result. #' @param object Instance of 'cve' as returned by \code{cve}. #' @method summary cve #' @export summary.cve <- function(object, ...) { cat('Summary of CVE result - Method: "', object$method, '"\n', '\n', 'Dataset size: ', nrow(object$X), '\n', 'Data Dimension: ', ncol(object$X), '\n', # 'SDR Dimension: ', object$k, '\n', # 'loss: ', object$loss, '\n', '\n', 'Called via:\n', ' ', sep='') print(object$call) L <- c() k <- c() for (dr.k in object$res) { if (class(dr.k) == 'cve.k') { k <- c(k, as.character(dr.k$k)) L <- c(L, dr.k$L) } } L <- matrix(L, ncol = length(k)) S <- apply(L, 2, summary) colnames(S) <- k cat('\n') print(S) } #' @export directions <- function(dr, k) { UseMethod("directions") } #' Computes projected training data \code{X} for given dimension `k`. #' #' @param dr Instance of 'cve' as returned by \code{cve}. #' @param k SDR dimension to use for projection. #' #' @method directions cve #' @aliases directions directions.cve #' @export directions.cve <- function(dr, k) { if (!(k %in% names(dr$res))) { stop("SDR directions for requested dimension `k` not computed.") } return(dr$X %*% dr$res[[as.character(k)]]$B) } #' @export basis <- function(dr, k) { UseMethod("basis") } #' Gets estimated SDR basis. #' #' @param dr Instance of 'cve' as returned by \code{cve}. #' @param k SDR dimension of requested basis, if not given a list of all #' computed basis is returned. #' #' @return List of basis matrices, or the SDR basis for supplied dimension `k`. #' #' @method basis cve #' @aliases basis basis.cve #' @export basis.cve <- function(dr, k) { if (missing(k)) { Bs <- list() for (k in names(dr$res)) { Bs[[k]] <- dr$res[[k]]$B } return(Bs) } else if (k %in% names(dr$res)) { return(dr$res[[as.character(k)]]$B) } else { stop("Requested dimenion `k` not computed.") } } #' Predict method for CVE Fits. #' #' Predict responces using reduced data with \code{\link{mars}}. #' #' @param object instance of class \code{cve} (result of \code{cve}, #' \code{cve.call}). #' @param X.new Matrix of the new data to be predicted. #' @param dim dimension of SDR space to be used for data projecition. #' @param ... further arguments passed to \code{\link{mars}}. #' #' @return prediced response of data \code{X.new}. #' #' @seealso \code{\link{cve}}, \code{\link{cve.call}} or \pkg{\link{mars}}. #' #' @examples #' TODO: #' #' @aliases predict.cve #' @rdname predict.cve #' #' @method predict cve #' @export predict.cve <- function(object, X.new, dim = NULL, ...) { library(mda) if (!is.matrix(X.new)) { X.new <- matrix(X.new, nrow = 1L) } B <- dr$res[[as.character(dim)]]$B model <- mars(object$X %*% B, object$Y) predict(model, X.new %*% B) } #' @export predict.dim <- function(dr) { UseMethod("predict.dim") } #' @method predict.dim cve #' @export predict.dim.cve <- function(dr) { library(mda) # Get centered training data and dimensions X <- scale(dr$X, center = TRUE, scale = FALSE) n <- nrow(dr$X) # umber of training data samples Sigma <- (1 / n) * crossprod(X, X) eig <- eigen(Sigma) Sigma_root <- eig$vectors %*% tcrossprod(diag(sqrt(eig$values)), eig$vectors) X <- X %*% solve(Sigma_root) pred <- matrix(0, n, length(dr$res)) colnames(pred) <- names(dr$res) for (dr.k in dr$res) { # get "name" of current dimension k <- as.character(dr.k$k) # Project dataset with current SDR basis X.proj <- X %*% dr.k$B for (i in 1:n) { model <- mars(X.proj[-i, ], dr$Y[-i]) pred[i, k] <- predict(model, X.proj[i, , drop = F]) } } MSE <- colMeans((pred - dr$Y)^2) return(list( MSE = MSE, k = as.integer(names(which.min(MSE))) )) }