134 lines
4.4 KiB
R
134 lines
4.4 KiB
R
#' Implementation of the CVE method using mini-batch stochastic gradient
#' descent. Each step is scaled element-wise by the square root of a running
#' average of (projected) squared gradients, i.e. an RMSprop-style update.
#'
#' The optimization is restarted `attempts` times from a random start value
#' and the result with the smallest loss on the full data is returned.
#'
#' @param X data matrix; rows are samples, columns are data dimensions.
#' @param Y responses, indexed per sample (`Y[batch]`).
#' @param k dimension subtracted from `ncol(X)` to obtain the number of
#'  columns `q = p - k` of the optimized matrix `V` (presumably the dimension
#'  of the SDR space).
#' @param nObs passed on to `estimate.bandwidth` when `h` is estimated.
#' @param h bandwidth; estimated through `estimate.bandwidth(X, k, nObs)` if
#'  missing or not numeric.
#' @param tau learning rate (step size) of the gradient steps.
#' @param tol tolerance of the break condition. Internally scaled by
#'  `sqrt(2 * q)` and compared against the Frobenius norm of the change of
#'  `V %*% t(V)` over one epoch.
#' @param rho weight of the previous running average in the update of the
#'  squared gradient average (the new squared gradient gets `1 - rho`).
#' @param epochs maximum number of passes over the data per attempt.
#' @param batch.size number of samples per mini-batch.
#' @param attempts number of random restarts; must be at least 1.
#' @param epsilon small constant added to the denominator of the step scaling
#'  to avoid a division by zero.
#' @param logger optional function. If given, it is called with this
#'  functions environment before the first epoch (with `epoch = 0` and
#'  `error = NA`) and after each epoch of every attempt.
#'
#' @return list with the elements `loss` (best loss over all attempts), `V`
#'  (the corresponding matrix), `B` (`null(V)`) and `h` (the used bandwidth).
#'
#' @keywords internal
#' @export
cve_sgdrmsprob <- function(X, Y, k,
                           nObs = sqrt(nrow(X)),
                           h = NULL,
                           tau = 0.1,
                           tol = 1e-4,
                           rho = 0.1,
                           epochs = 50L,
                           batch.size = 16L,
                           attempts = 10L,
                           epsilon = 1e-7,
                           logger = NULL
) {
    # Without a single attempt there is no result to report.
    if (attempts < 1L) {
        stop("`attempts` must be at least 1.")
    }

    # Set the environment of `grad` to this functions environment. This lets
    # `grad` find (and manipulate) the local variables defined below.
    # NOTE: because of this (and because `logger` receives the environment as
    # well) the local variable names are part of an implicit interface and
    # must not be renamed or removed, even if they look unused in here.
    environment(grad) <- environment()

    # Get dimensions.
    n <- nrow(X) # Number of samples.
    p <- ncol(X) # Data dimensions
    q <- p - k   # Complement dimension of the SDR space.

    # Save initial learning rate `tau`.
    tau.init <- tau
    # Adapt tolerance for break condition.
    tol <- sqrt(2 * q) * tol

    # Estimate bandwidth if not given.
    if (missing(h) || !is.numeric(h)) {
        h <- estimate.bandwidth(X, k, nObs)
    }

    # Compute persistent data (may be read by `grad` when called with
    # `persistent = TRUE`).
    # Compute lookup indexes for symmetry, lower/upper
    # triangular parts and vectorization.
    pair.index <- elem.pairs(seq_len(n))
    i <- pair.index[1, ] # `i` indices of `(i, j)` pairs
    j <- pair.index[2, ] # `j` indices of `(i, j)` pairs
    # Index of vectorized matrix, for lower and upper triangular part.
    lower <- ((i - 1) * n) + j
    upper <- ((j - 1) * n) + i

    # Create all pairwise differences of rows of `X`.
    X_diff <- X[i, , drop = FALSE] - X[j, , drop = FALSE]
    # Identity matrix.
    I_p <- diag(1, p)
    # Init a list of data indices (shuffled for batching).
    indices <- seq_len(n)

    # Init tracking of current best (according multiple attempts).
    V.best <- NULL
    loss.best <- Inf

    # Start loop for multiple attempts.
    for (attempt in seq_len(attempts)) {
        # Reset learning rate `tau`.
        tau <- tau.init

        # Sample a `(p, q)` dimensional matrix from the Stiefel manifold as
        # optimization start value.
        V <- rStiefl(p, q)
        # Keep track of last `V` for computing error after an epoch.
        V.last <- V

        if (is.function(logger)) {
            # Log the start state as epoch 0 (no error available yet).
            loss <- grad(X, Y, V, h, loss.only = TRUE, persistent = TRUE)
            error <- NA
            epoch <- 0
            logger(environment())
        }

        # Running average of the (projected) squared gradients.
        M <- matrix(0, p, q)
        # Repeat `epochs` times
        for (epoch in seq_len(epochs)) {
            # Shuffle batches
            batch.shuffle <- sample(indices)

            # Make a step for each batch.
            for (batch.start in seq(1, n, batch.size)) {
                # Select batch data indices (the last batch may be smaller).
                batch.end <- min(batch.start + batch.size - 1,
                                 length(batch.shuffle))
                batch <- batch.shuffle[batch.start:batch.end]

                # Compute batch gradient. `loss` is reset first as `grad` is
                # asked to report the loss (`loss.out = TRUE`).
                # `drop = FALSE` keeps `X[batch, ]` a matrix even if the
                # batch consists of a single sample.
                loss <- NULL
                G <- grad(X[batch, , drop = FALSE], Y[batch], V, h,
                          loss.out = TRUE)

                # Projected Gradient.
                A <- projTangentStiefl(V, G)
                # Projected element squared gradient.
                Asq <- projTangentStiefl(V, G * G)
                # Update running average of the squared gradients.
                M <- (1 - rho) * Asq + rho * projTangentStiefl(V, M)
                # Step against the scaled projected gradient followed by a
                # retraction (`retractStiefl`) of the result. The step size
                # is the initial learning rate `tau.init`.
                V <- retractStiefl(V - tau.init * A / (sqrt(abs(M)) + epsilon))
            }
            # And the error for the history (change of the projection
            # `V V'` over this epoch in Frobenius norm).
            error <- norm(V.last %*% t(V.last) - V %*% t(V), type = "F")
            V.last <- V

            if (is.function(logger)) {
                # Compute loss at end of epoch for logging.
                loss <- grad(X, Y, V, h, loss.only = TRUE, persistent = TRUE)
                logger(environment())
            }

            # Check break condition.
            if (error < tol) {
                break
            }
        }
        # Compute actual loss after finishing for comparing multiple attempts.
        loss <- grad(X, Y, V, h, loss.only = TRUE, persistent = TRUE)

        # After each attempt, check if last attempt reached a better result.
        if (loss < loss.best) {
            loss.best <- loss
            V.best <- V
        }
    }

    return(list(
        loss = loss.best,
        V = V.best,
        B = null(V.best),
        h = h
    ))
}
|