predict_dim_cv <- function(object) { # Get centered training data and dimensions X <- scale(object$X, center = TRUE, scale = FALSE) n <- nrow(object$X) # umber of training data samples Sigma <- (1 / n) * crossprod(X, X) eig <- eigen(Sigma) Sigma_root <- eig$vectors %*% tcrossprod(diag(sqrt(eig$values)), eig$vectors) X <- X %*% solve(Sigma_root) pred <- matrix(0, n, length(object$res)) colnames(pred) <- names(object$res) for (dr.k in object$res) { # get "name" of current dimension k <- as.character(dr.k$k) # Project dataset with current SDR basis X.proj <- X %*% dr.k$B for (i in 1:n) { model <- mda::mars(X.proj[-i, ], object$Y[-i]) pred[i, k] <- predict(model, X.proj[i, , drop = F]) } } MSE <- colMeans((pred - object$Y)^2) return(list( MSE = MSE, k = as.integer(names(which.min(MSE))) )) } predict_dim_elbow <- function(object) { # extract original data from object (cve result) X <- object$X Y <- object$Y # Get dimensions n <- nrow(X) p <- ncol(X) losses <- vector("double", length(object$res)) names(losses) <- names(object$res) # Compute per sample losses with alternative bandwidth for each dimension. for (dr.k in object$res) { # extract dimension specific estimates and dimensions. k <- dr.k$k V <- dr.k$V # estimate bandwidth according alternative formula. h <- estimate.bandwidth(X, k, sqrt(n), version = 2L) # Projected `X` XQ <- X %*% (diag(1, p) - tcrossprod(V)) # X (I - V V') # Compute distances d2 <- tcrossprod(XQ) # XQ XQ' d1 <- matrix(diag(d2), n, n) D <- d1 - 2 * d2 + t(d1) # Apply kernel # Note: CVE uses for d = ||Q(X_i - X_j)|| the kernel exp(-d^4 / (2 h^2)) K <- exp((-0.5 / h^2) * D^2) # sum columns colSumsK <- colSums(K) # compute weighted and square meighted reponses y1 <- (K %*% Y) / colSumsK y2 <- (K %*% Y^2) / colSumsK # total loss losses[[as.character(k)]] <- mean(y2 - y1^2) } return(list( losses = losses, k = as.integer(names(which.min(losses))) )) } predict_dim_wilcoxon <- function(object, p.value = 0.05) { # extract original data from object (cve result) X <- object$X Y <- object$Y # Get dimensions n <- nrow(X) p <- ncol(X) L <- matrix(NA, n, length(object$res)) colnames(L) <- names(object$res) # Compute per sample losses with alternative bandwidth for each dimension. for (dr.k in object$res) { # extract dimension specific estimates and dimensions. k <- dr.k$k V <- dr.k$V # estimate bandwidth according alternative formula. h <- estimate.bandwidth(X, k, sqrt(n), version = 2L) # Projected `X` XQ <- X %*% (diag(1, p) - tcrossprod(V)) # X (I - V V') # Compute distances d2 <- tcrossprod(XQ) # XQ XQ' d1 <- matrix(diag(d2), n, n) D <- d1 - 2 * d2 + t(d1) # Apply kernel # Note: CVE uses for d = ||Q(X_i - X_j)|| the kernel exp(-d^4 / (2 h^2)) K <- exp((-0.5 / h^2) * D^2) # sum columns colSumsK <- colSums(K) # compute weighted and square meighted reponses y1 <- (K %*% Y) / colSumsK y2 <- (K %*% Y^2) / colSumsK # element-wise L for dim. k L[, as.character(k)] <- y2 - y1^2 } for (ind in seq_len(length(object$res) - 1L)) { p.test <- wilcox.test(L[, ind], L[, ind + 1L], alternative = "less")$p.value if (p.test < p.value) { return(list( p.value = p.test, k = object$res[[ind]]$k )) } } return(list( p.value = NA, k = object$res[[length(object$res)]]$k )) } #' Estimate Dimension of Reduction Space. #' #' This function estimates the dimension of the mean dimension reduction space, #' i.e. number of columns of \eqn{B} matrix. The default method \code{'CV'} #' performs cross-validation using \code{mars}. Given #' \code{k = min.dim, ..., max.dim} a cross-validation via \code{mars} is #' performed on the dataset \eqn{(Y i, B_k' X_i)_{i = 1, ..., n}} where #' \eqn{B_k} is the \eqn{p \times k}{p x k} dimensional CVE estimate given #' \eqn{k}. The estimated SDR dimension is the \eqn{k} where the #' cross-validation mean squared error is the lowest. The method \code{'elbow'} #' estimates the dimension via \eqn{k = argmin_k L_n(V_{p − k})} where #' \eqn{V_{p − k}} is the CVE estimate of the orthogonal columnspace of #' \eqn{B_k}. Method \code{'wilcoxon'} is similar to \code{'elbow'} but finds #' the minimum using the wilcoxon-test. #' #' @param object instance of class \code{cve} (result of \code{\link{cve}}, #' \code{\link{cve.call}}). #' @param method This parameter specify which method will be used in dimension #' estimation. It provides three methods \code{'CV'} (default), \code{'elbow'}, #' and \code{'wilcoxon'} to estimate the dimension of the SDR. #' @param ... ignored. #' #' @return list with #' \describe{ #' \item{}{cretirion of method for \code{k = min.dim, ..., max.dim}.} #' \item{k}{estimated dimension as argmin over \eqn{k} of criterion.} #' } #' #' @examples #' # create B for simulation #' B <- rep(1, 5) / sqrt(5) #' #' set.seed(21) #' # creat predictor data x ~ N(0, I_p) #' x <- matrix(rnorm(500), 100) #' #' # simulate response variable #' # y = f(B'x) + err #' # with f(x1) = x1 and err ~ N(0, 0.25^2) #' y <- x %*% B + 0.25 * rnorm(100) #' #' # Calculate cve for unknown k between min.dim and max.dim. #' cve.obj.simple <- cve(y ~ x) #' #' predict_dim(cve.obj.simple) #' #' @export predict_dim <- function(object, ..., method = "CV") { # Check if there are dimensions to select. if (length(object$res) == 1L) { return(list( message = "Only one dim. estimated.", k = as.integer(names(object$res)) )) } # Determine method "fuzzy". methods <- c("cv", "elbow", "wilcoxon") names(methods) <- methods method <- methods[[tolower(method), exact = FALSE]] if (is.null(method)) { stop('Unable to determine method.') } if (method == "cv") { return(predict_dim_cv(object)) } else if (method == "elbow") { return(predict_dim_elbow(object)) } else if (method == "wilcoxon") { return(predict_dim_wilcoxon(object)) } else { stop("Unable to determine method.") } }