2
0
Fork 0
CVE/CVE_C/R/CVE.R

283 lines
9.6 KiB
R

#' Conditional Variance Estimator (CVE) Package.
#'
#' Conditional Variance Estimation (CVE) is a novel sufficient dimension
#' reduction (SDR) method for regressions satisfying \eqn{E(Y|X) = E(Y|B'X)},
#' where \eqn{B'X} is a lower dimensional projection of the predictors. CVE,
#' similarly to its main competitor, the mean average variance estimation
#' (MAVE), is not based on inverse regression, and does not require the
#' restrictive linearity and constant variance conditions of moment based SDR
#' methods. CVE is data-driven and applies to additive error regressions with
#' continuous predictors and link function. The effectiveness and accuracy of
#' CVE compared to MAVE and other SDR techniques is demonstrated in simulation
#' studies. CVE is shown to outperform MAVE in some model set-ups, while it
#' remains largely on par under most others.
#'
#' Let \eqn{Y} be real denotes a univariate response and \eqn{X} a real
#' \eqn{p}-dimensional covariate vector. We assume that the dependence of
#' \eqn{Y} and \eqn{X} is modelled by
#' \deqn{Y = g(B'X) + \epsilon}
#' where \eqn{X} is independent of \eqn{\epsilon} with positive definite
#' variance-covariance matrix \eqn{Var(X) = \Sigma_X}. \eqn{\epsilon} is a mean
#' zero random variable with finite \eqn{Var(\epsilon) = E(\epsilon^2)}, \eqn{g}
#' is an unknown, continuous non-constant function,
#' and \eqn{B = (b_1, ..., b_k)} is
#' a real \eqn{p \times k}{p x k} of rank \eqn{k <= p}{k \leq p}.
#' Without loss of generality \eqn{B} is assumed to be orthonormal.
#'
#' @author Daniel Kapla, Lukas Fertl, Bura Efstathia
#' @references Fertl Lukas, Bura Efstathia. Conditional Variance Estimation for
#' Sufficient Dimension Reduction, 2019
#'
#' @importFrom stats model.frame
#' @docType package
#' @useDynLib CVE, .registration = TRUE
"_PACKAGE"
#' Conditional Variance Estimator (CVE).
#'
#' TODO: reuse of package description and details!!!!
#'
#' @param formula an object of class \code{"formula"} which is a symbolic
#' description of the model to be fitted.
#' @param data an optional data frame, containing the data for the formula if
#' supplied.
#' @param method specifies the CVE method variation as one of
#' \itemize{
#' \item "simple" exact implementation as describet in the paper listed
#' below.
#' \item "weighted" variation with addaptive weighting of slices.
#' }
#' @param ... Parameters passed on to \code{cve.call}.
#' @examples
#' library(CVE)
#'
#' # create dataset
#' n <- 200
#' p <- 12
#' X <- matrix(rnorm(n * p), n, p)
#' B <- cbind(c(1, rep(0, p - 1)), c(0, 1, rep(0, p - 2)))
#' Y <- X %*% B
#' Y <- Y[, 1L]^2 + Y[, 2L]^2 + rnorm(n, 0, 0.3)
#'
#' # Call the CVE method.
#' dr <- cve(Y ~ X)
#' round(dr[[2]]$B, 1)
#'
#' @seealso For a detailed description of the formula parameter see
#' [\code{\link{formula}}].
#' @export
cve <- function(formula, data, method = "simple", max.dim = 10L, ...) {
# check for type of `data` if supplied and set default
if (missing(data)) {
data <- environment(formula)
} else if (!is.data.frame(data)) {
stop("Parameter 'data' must be a 'data.frame' or missing.")
}
# extract `X`, `Y` from `formula` with `data`
model <- stats::model.frame(formula, data)
X <- as.matrix(model[ ,-1L, drop = FALSE])
Y <- as.double(model[ , 1L])
# pass extracted data on to [cve.call()]
dr <- cve.call(X, Y, method = method, max.dim = max.dim, ...)
# overwrite `call` property from [cve.call()]
dr$call <- match.call()
return(dr)
}
#' @param nObs parameter for choosing bandwidth \code{h} using
#' \code{\link{estimate.bandwidth}} (ignored if \code{h} is supplied).
#' @param X data matrix with samples in its rows.
#' @param Y Responces (1 dimensional).
#' @param k Dimension of lower dimensional projection, if given only the
#' specified dimension is estimated.
#' @param min.dim lower bounds for \code{k}, (ignored if \code{k} is supplied).
#' @param max.dim upper bounds for \code{k}, (ignored if \code{k} is supplied).
#' @param tau Initial step-size.
#' @param tol Tolerance for break condition.
#' @param epochs maximum number of optimization steps.
#' @param attempts number of arbitrary different starting points.
#' @param logger a logger function (only for addvanced user).
#' @rdname cve
#' @export
cve.call <- function(X, Y, method = "simple",
nObs = sqrt(nrow(X)), h = NULL,
min.dim = 1L, max.dim = 10L, k = NULL,
tau = 1.0, tol = 1e-3,
epochs = 50L, attempts = 10L,
logger = NULL) {
# parameter checking
if (!(is.matrix(X) && is.numeric(X))) {
stop("Parameter 'X' should be a numeric matrices.")
}
if (!is.numeric(Y)) {
stop("Parameter 'Y' must be numeric.")
}
if (is.matrix(Y) || !is.double(Y)) {
Y <- as.double(Y)
}
if (nrow(X) != length(Y)) {
stop("Rows of 'X' and 'Y' elements are not compatible.")
}
if (ncol(X) < 2) {
stop("'X' is one dimensional, no need for dimension reduction.")
}
if (missing(k) || is.null(k)) {
min.dim <- as.integer(min.dim)
max.dim <- as.integer(min(max.dim, ncol(X) - 1L))
} else {
min.dim <- as.integer(k)
max.dim <- as.integer(k)
}
if (min.dim > max.dim) {
stop("'min.dim' bigger 'max.dim'.")
}
if (max.dim >= ncol(X)) {
stop("'max.dim' (or 'k') must be smaller than 'ncol(X)'.")
}
if (missing(h) || is.null(h)) {
estimate <- TRUE
} else if (is.function(h)) {
estimate <- TRUE
estimate.bandwidth <- h
} else if (is.numeric(h) && h > 0.0) {
estimate <- FALSE
h <- as.double(h)
} else {
stop("Bandwidth 'h' must be positive numeric.")
}
if (!is.numeric(tau) || length(tau) > 1L || tau <= 0.0) {
stop("Initial step-width 'tau' must be positive number.")
} else {
tau <- as.double(tau)
}
if (!is.numeric(tol) || length(tol) > 1L || tol < 0.0) {
stop("Break condition tolerance 'tol' must be not negative number.")
} else {
tol <- as.double(tol)
}
if (!is.numeric(epochs) || length(epochs) > 1L) {
stop("Parameter 'epochs' must be positive integer.")
} else if (!is.integer(epochs)) {
epochs <- as.integer(epochs)
}
if (epochs < 1L) {
stop("Parameter 'epochs' must be at least 1L.")
}
if (!is.numeric(attempts) || length(attempts) > 1L) {
stop("Parameter 'attempts' must be positive integer.")
} else if (!is.integer(attempts)) {
attempts <- as.integer(attempts)
}
if (attempts < 1L) {
stop("Parameter 'attempts' must be at least 1L.")
}
if (is.function(logger)) {
loggerEnv <- environment(logger)
} else {
loggerEnv <- NULL
}
# Call specified method.
method <- tolower(method)
call <- match.call()
dr <- list()
for (k in min.dim:max.dim) {
if (estimate) {
h <- estimate.bandwidth(X, k, nObs)
}
if (method == 'simple') {
dr.k <- .Call('cve_simple', PACKAGE = 'CVE',
X, Y, k, h,
tau, tol,
epochs, attempts,
logger, loggerEnv)
# dr.k <- cve_simple(X, Y, k, nObs = nObs, ...)
# } else if (method == 'linesearch') {
# dr.k <- cve_linesearch(X, Y, k, nObs = nObs, ...)
# } else if (method == 'rcg') {
# dr.k <- cve_rcg(X, Y, k, nObs = nObs, ...)
# } else if (method == 'momentum') {
# dr.k <- cve_momentum(X, Y, k, nObs = nObs, ...)
# } else if (method == 'rmsprob') {
# dr.k <- cve_rmsprob(X, Y, k, nObs = nObs, ...)
# } else if (method == 'sgdrmsprob') {
# dr.k <- cve_sgdrmsprob(X, Y, k, nObs = nObs, ...)
# } else if (method == 'sgd') {
# dr.k <- cve_sgd(X, Y, k, nObs = nObs, ...)
} else {
stop('Got unknown method.')
}
dr.k$B <- null(dr.k$V)
dr.k$loss <- mean(dr.k$L)
dr.k$h <- h
dr.k$k <- k
class(dr.k) <- "cve.k"
dr[[k]] <- dr.k
}
# augment result information
dr$X <- X
dr$Y <- Y
dr$method <- method
dr$call <- call
class(dr) <- "cve"
return(dr)
}
#' Loss distribution kink plot.
#'
#' @param x Object of class \code{"cve"} (result of [\code{\link{cve}}]).
#' @param ... Pass through parameters to [\code{\link{plot}}] and
#' [\code{\link{lines}}]
#'
#' @seealso see \code{\link{par}} for graphical parameters to pass through
#' as well as \code{\link{plot}}, the standard plot utility.
#' @importFrom graphics plot lines points
#' @method plot cve
#' @export
plot.cve <- function(x, ...) {
L <- c()
k <- c()
for (dr.k in x) {
if (class(dr.k) == 'cve.k') {
k <- c(k, paste0(dr.k$k))
L <- c(L, dr.k$L)
}
}
L <- matrix(L, ncol = length(k)) / var(x$Y)
boxplot(L, main = "Kink plot",
xlab = "SDR dimension",
ylab = "Sample loss distribution",
names = k)
# lines(apply(L, 2, mean)) # TODO: ?
}
#' Prints a summary of a \code{cve} result.
#' @param object Instance of 'cve' as return of \code{cve}.
#' @method summary cve
#' @export
summary.cve <- function(object, ...) {
cat('Summary of CVE result - Method: "', object$method, '"\n',
'\n',
'Dataset size: ', nrow(object$X), '\n',
'Data Dimension: ', ncol(object$X), '\n',
'SDR Dimension: ', object$k, '\n',
'loss: ', object$loss, '\n',
'\n',
'Called via:\n',
' ',
sep='')
print(object$call)
}