2
0
Fork 0
CVE/CVE_C/R/predict_dim.R

201 lines
5.8 KiB
R

predict_dim_cv <- function(object) {
# Get centered training data and dimensions
X <- scale(object$X, center = TRUE, scale = FALSE)
n <- nrow(object$X) # umber of training data samples
Sigma <- (1 / n) * crossprod(X, X)
eig <- eigen(Sigma)
Sigma_root <- eig$vectors %*% tcrossprod(diag(sqrt(eig$values)), eig$vectors)
X <- X %*% solve(Sigma_root)
pred <- matrix(0, n, length(object$res))
colnames(pred) <- names(object$res)
for (dr.k in object$res) {
# get "name" of current dimension
k <- as.character(dr.k$k)
# Project dataset with current SDR basis
X.proj <- X %*% dr.k$B
for (i in 1:n) {
model <- mda::mars(X.proj[-i, ], object$Y[-i])
pred[i, k] <- predict(model, X.proj[i, , drop = F])
}
}
MSE <- colMeans((pred - object$Y)^2)
return(list(
MSE = MSE,
k = as.integer(names(which.min(MSE)))
))
}
# TODO: write doc
predict_dim_elbow <- function(object) {
# extract original data from object (cve result)
X <- object$X
Y <- object$Y
# Get dimensions
n <- nrow(X)
p <- ncol(X)
# Compute persistent data.
i = rep(1:n, n)
j = rep(1:n, each = n)
D.eucl = matrix((X[i, ] - X[j, ])^2 %*% rep(1, p), n)
losses <- vector("double", length(object$res))
names(losses) <- names(object$res)
# Compute per sample losses with alternative bandwidth for each dimension.
for (dr.k in object$res) {
# extract dimension specific estimates and dimensions.
k <- dr.k$k
V <- dr.k$V
q <- ncol(V)
# estimate bandwidth according alternative formula (see: TODO: see)
h <- estimate.bandwidth(X, k, sqrt(n), version = 2L)
# Projected `X`
XV <- X %*% V
# Devectorized distance matrix
# (inefficient in R but fast in C)
D <- matrix((XV[i, , drop = F] - XV[j, , drop = F])^2 %*% rep(1, q), n)
D <- D.eucl - D
# Apply kernel
K <- exp((-0.5 / h^2) * D^2)
# sum columns
colSumsK <- colSums(K)
# compute weighted and square meighted reponses
y1 <- (K %*% Y) / colSumsK
y2 <- (K %*% Y^2) / colSumsK
# total loss
losses[[as.character(k)]] <- mean(y2 - y1^2)
}
return(list(
losses = losses,
k = as.integer(names(which.min(losses)))
))
}
predict_dim_wilcoxon <- function(object, p.value = 0.05) {
# extract original data from object (cve result)
X <- object$X
Y <- object$Y
# Get dimensions
n <- nrow(X)
p <- ncol(X)
# Compute persistent data.
i = rep(1:n, n)
j = rep(1:n, each = n)
D.eucl = matrix((X[i, ] - X[j, ])^2 %*% rep(1, p), n)
L <- matrix(NA, n, length(object$res))
colnames(L) <- names(object$res)
# Compute per sample losses with alternative bandwidth for each dimension.
for (dr.k in object$res) {
# extract dimension specific estimates and dimensions.
k <- dr.k$k
V <- dr.k$V
q <- ncol(V)
# estimate bandwidth according alternative formula (see: TODO: see)
h <- estimate.bandwidth(X, k, sqrt(n), version = 2L)
# Projected `X`
XV <- X %*% V
# Devectorized distance matrix
# (inefficient in R but fast in C)
D <- matrix((XV[i, , drop = F] - XV[j, , drop = F])^2 %*% rep(1, q), n)
D <- D.eucl - D
# Apply kernel
K <- exp((-0.5 / h^2) * D^2)
# sum columns
colSumsK <- colSums(K)
# compute weighted and square meighted reponses
y1 <- (K %*% Y) / colSumsK
y2 <- (K %*% Y^2) / colSumsK
# element-wise L for dim. k
L[, as.character(k)] <- y2 - y1^2
}
for (ind in seq_len(length(object$res) - 1L)) {
p.test <- wilcox.test(L[, ind], L[, ind + 1L],
alternative = "less")$p.value
if (p.test < p.value) {
return(list(
p.value = p.test,
k = object$res[[ind]]$k
))
}
}
return(list(
p.value = NA,
k = object$res[[length(object$res)]]$k
))
}
#' Predicts SDR dimension using \code{\link[mda]{mars}} via a Cross-Validation.
#' TODO: rewrite!!!
#'
#' @param object instance of class \code{cve} (result of \code{cve},
#' \code{cve.call}).
#' @param ... ignored.
#'
#' @return list with
#' \itemize{
#' \item MSE: Mean Square Error,
#' \item k: predicted dimensions.
#' }
#'
#' @section cv:
#' Cross-validation ... TODO:
#'
#' @section elbow:
#' Cross-validation ... TODO:
#'
#' @section wilcoxon:
#' Cross-validation ... TODO:
#'
#' @examples
#' # create B for simulation
#' B <- rep(1, 5) / sqrt(5)
#'
#' set.seed(21)
#' # creat predictor data x ~ N(0, I_p)
#' x <- matrix(rnorm(500), 100)
#'
#' # simulate response variable
#' # y = f(B'x) + err
#' # with f(x1) = x1 and err ~ N(0, 0.25^2)
#' y <- x %*% B + 0.25 * rnorm(100)
#'
#' # Calculate cve for unknown k between min.dim and max.dim.
#' cve.obj.simple <- cve(y ~ x)
#'
#' predict_dim(cve.obj.simple)
#'
#' @export
predict_dim <- function(object, ..., method = "CV") {
# Check if there are dimensions to select.
if (length(object$res) == 1L) {
return(list(
message = "Only one dim. estimated.",
k = as.integer(names(object$res))
))
}
# Determine method "fuzzy".
methods <- c("cv", "elbow", "wilcoxon")
names(methods) <- methods
method <- methods[[tolower(method), exact = FALSE]]
if (is.null(method)) {
stop('Unable to determine method.')
}
if (method == "cv") {
return(predict_dim_cv(object))
} else if (method == "elbow") {
return(predict_dim_elbow(object))
} else if (method == "wilcoxon") {
return(predict_dim_wilcoxon(object))
} else {
stop("Unable to determine method.")
}
}