2019-12-10 07:45:07 +00:00
predict_dim_cv <- function ( object ) {
# Get centered training data and dimensions
X <- scale ( object $ X , center = TRUE , scale = FALSE )
n <- nrow ( object $ X ) # umber of training data samples
Sigma <- ( 1 / n ) * crossprod ( X , X )
eig <- eigen ( Sigma )
Sigma_root <- eig $ vectors %*% tcrossprod ( diag ( sqrt ( eig $ values ) ) , eig $ vectors )
X <- X %*% solve ( Sigma_root )
pred <- matrix ( 0 , n , length ( object $ res ) )
colnames ( pred ) <- names ( object $ res )
for ( dr.k in object $ res ) {
# get "name" of current dimension
k <- as.character ( dr.k $ k )
# Project dataset with current SDR basis
X.proj <- X %*% dr.k $ B
for ( i in 1 : n ) {
model <- mda :: mars ( X.proj [ - i , ] , object $ Y [ - i ] )
pred [i , k ] <- predict ( model , X.proj [i , , drop = F ] )
}
}
MSE <- colMeans ( ( pred - object $ Y ) ^2 )
return ( list (
MSE = MSE ,
k = as.integer ( names ( which.min ( MSE ) ) )
) )
}
2019-12-17 20:43:14 +00:00
2019-12-10 07:45:07 +00:00
predict_dim_elbow <- function ( object ) {
# extract original data from object (cve result)
X <- object $ X
Y <- object $ Y
# Get dimensions
n <- nrow ( X )
p <- ncol ( X )
losses <- vector ( " double" , length ( object $ res ) )
names ( losses ) <- names ( object $ res )
# Compute per sample losses with alternative bandwidth for each dimension.
for ( dr.k in object $ res ) {
# extract dimension specific estimates and dimensions.
k <- dr.k $ k
V <- dr.k $ V
2019-12-16 16:34:35 +00:00
# estimate bandwidth according alternative formula.
2019-12-10 07:45:07 +00:00
h <- estimate.bandwidth ( X , k , sqrt ( n ) , version = 2L )
# Projected `X`
2019-12-16 16:34:35 +00:00
XQ <- X %*% ( diag ( 1 , p ) - tcrossprod ( V ) ) # X (I - V V')
# Compute distances
d2 <- tcrossprod ( XQ ) # XQ XQ'
d1 <- matrix ( diag ( d2 ) , n , n )
D <- d1 - 2 * d2 + t ( d1 )
2019-12-10 07:45:07 +00:00
# Apply kernel
2019-12-16 16:34:35 +00:00
# Note: CVE uses for d = ||Q(X_i - X_j)|| the kernel exp(-d^4 / (2 h^2))
2019-12-10 07:45:07 +00:00
K <- exp ( ( -0.5 / h ^2 ) * D ^2 )
# sum columns
colSumsK <- colSums ( K )
# compute weighted and square meighted reponses
y1 <- ( K %*% Y ) / colSumsK
y2 <- ( K %*% Y ^2 ) / colSumsK
# total loss
losses [ [as.character ( k ) ] ] <- mean ( y2 - y1 ^2 )
}
return ( list (
losses = losses ,
k = as.integer ( names ( which.min ( losses ) ) )
) )
}
predict_dim_wilcoxon <- function ( object , p.value = 0.05 ) {
# extract original data from object (cve result)
X <- object $ X
Y <- object $ Y
# Get dimensions
n <- nrow ( X )
p <- ncol ( X )
2019-12-16 16:34:35 +00:00
2019-12-10 07:45:07 +00:00
L <- matrix ( NA , n , length ( object $ res ) )
colnames ( L ) <- names ( object $ res )
# Compute per sample losses with alternative bandwidth for each dimension.
for ( dr.k in object $ res ) {
# extract dimension specific estimates and dimensions.
k <- dr.k $ k
V <- dr.k $ V
2019-12-16 16:34:35 +00:00
# estimate bandwidth according alternative formula.
2019-12-10 07:45:07 +00:00
h <- estimate.bandwidth ( X , k , sqrt ( n ) , version = 2L )
# Projected `X`
2019-12-16 16:34:35 +00:00
XQ <- X %*% ( diag ( 1 , p ) - tcrossprod ( V ) ) # X (I - V V')
# Compute distances
d2 <- tcrossprod ( XQ ) # XQ XQ'
d1 <- matrix ( diag ( d2 ) , n , n )
D <- d1 - 2 * d2 + t ( d1 )
2019-12-10 07:45:07 +00:00
# Apply kernel
2019-12-16 16:34:35 +00:00
# Note: CVE uses for d = ||Q(X_i - X_j)|| the kernel exp(-d^4 / (2 h^2))
2019-12-10 07:45:07 +00:00
K <- exp ( ( -0.5 / h ^2 ) * D ^2 )
# sum columns
colSumsK <- colSums ( K )
# compute weighted and square meighted reponses
y1 <- ( K %*% Y ) / colSumsK
y2 <- ( K %*% Y ^2 ) / colSumsK
# element-wise L for dim. k
L [ , as.character ( k ) ] <- y2 - y1 ^2
}
for ( ind in seq_len ( length ( object $ res ) - 1L ) ) {
p.test <- wilcox.test ( L [ , ind ] , L [ , ind + 1L ] ,
alternative = " less" ) $ p.value
if ( p.test < p.value ) {
return ( list (
p.value = p.test ,
k = object $ res [ [ind ] ] $ k
) )
}
}
return ( list (
p.value = NA ,
k = object $ res [ [length ( object $ res ) ] ] $ k
) )
}
2019-12-17 20:43:14 +00:00
#' Estimate Dimension of Reduction Space.
#'
#' This function estimates the dimension of the mean dimension reduction space,
#' i.e. number of columns of \eqn{B} matrix. The default method \code{'CV'}
2019-12-20 08:40:46 +00:00
#' performs l.o.o cross-validation using \code{mars}. Given
2019-12-17 20:43:14 +00:00
#' \code{k = min.dim, ..., max.dim} a cross-validation via \code{mars} is
2019-12-20 08:40:46 +00:00
#' performed on the dataset \eqn{(Y_i, B_k' X_i)_{i = 1, ..., n}} where
#' \eqn{B_k} is the \eqn{p \times k}{p x k} dimensional CVE estimate. The
#' estimated SDR dimension is the \eqn{k} where the
#' cross-validation mean squared error is minimal. The method \code{'elbow'}
#' estimates the dimension via \eqn{k = argmin_k L_n(V_{p - k})} where
#' \eqn{V_{p - k}} is space that is orthogonal to the columns-space of the CVE estimate of \eqn{B_k}. Method \code{'wilcoxon'} is similar to \code{'elbow'}
#' but finds the minimum using the wilcoxon-test.
2019-12-10 07:45:07 +00:00
#'
2019-12-20 08:40:46 +00:00
#' @param object an object of class \code{"cve"}, usually, a result of a call to
#' \code{\link{cve}} or \code{\link{cve.call}}.
2019-12-17 20:43:14 +00:00
#' @param method This parameter specify which method will be used in dimension
#' estimation. It provides three methods \code{'CV'} (default), \code{'elbow'},
#' and \code{'wilcoxon'} to estimate the dimension of the SDR.
2019-11-22 08:32:14 +00:00
#' @param ... ignored.
2019-11-25 19:49:43 +00:00
#'
2019-12-17 20:43:14 +00:00
#' @return list with
#' \describe{
#' \item{}{cretirion of method for \code{k = min.dim, ..., max.dim}.}
#' \item{k}{estimated dimension as argmin over \eqn{k} of criterion.}
#' }
2019-12-10 07:45:07 +00:00
#'
2019-12-05 16:35:29 +00:00
#' @examples
#' # create B for simulation
#' B <- rep(1, 5) / sqrt(5)
#'
#' set.seed(21)
#' # creat predictor data x ~ N(0, I_p)
#' x <- matrix(rnorm(500), 100)
#'
#' # simulate response variable
#' # y = f(B'x) + err
#' # with f(x1) = x1 and err ~ N(0, 0.25^2)
#' y <- x %*% B + 0.25 * rnorm(100)
#'
#' # Calculate cve for unknown k between min.dim and max.dim.
#' cve.obj.simple <- cve(y ~ x)
#'
#' predict_dim(cve.obj.simple)
#'
2019-11-25 19:49:43 +00:00
#' @export
2019-12-10 07:45:07 +00:00
predict_dim <- function ( object , ... , method = " CV" ) {
# Check if there are dimensions to select.
if ( length ( object $ res ) == 1L ) {
return ( list (
message = " Only one dim. estimated." ,
k = as.integer ( names ( object $ res ) )
) )
}
2019-11-22 08:32:14 +00:00
2019-12-10 07:45:07 +00:00
# Determine method "fuzzy".
methods <- c ( " cv" , " elbow" , " wilcoxon" )
names ( methods ) <- methods
method <- methods [ [tolower ( method ) , exact = FALSE ] ]
if ( is.null ( method ) ) {
stop ( ' Unable to determine method.' )
2019-11-22 08:32:14 +00:00
}
2019-12-10 07:45:07 +00:00
if ( method == " cv" ) {
return ( predict_dim_cv ( object ) )
} else if ( method == " elbow" ) {
return ( predict_dim_elbow ( object ) )
} else if ( method == " wilcoxon" ) {
return ( predict_dim_wilcoxon ( object ) )
} else {
stop ( " Unable to determine method." )
}
2019-11-22 08:32:14 +00:00
}