2
0
Fork 0
CVE/CVE_R/R/cve_sgd.R

127 lines
4.1 KiB
R

#' Simple implementation of the CVE method. 'Simple' means that this method is
#' a classic GD method unsing no further tricks.
#'
#' @keywords internal
#' @export
cve_sgd <- function(X, Y, k,
nObs = sqrt(nrow(X)),
h = NULL,
tau = 0.01,
tol = 1e-3,
epochs = 50L,
batch.size = 16L,
attempts = 10L
) {
# Set `grad` functions environment to enable if to find this environments
# local variabels, needed to enable the manipulation of this local variables
# from within `grad`.
environment(grad) <- environment()
# Setup histories.
loss.history <- matrix(NA, epochs, attempts)
error.history <- matrix(NA, epochs, attempts)
# Get dimensions.
n <- nrow(X) # Number of samples.
p <- ncol(X) # Data dimensions
q <- p - k # Complement dimension of the SDR space.
# Save initial learning rate `tau`.
tau.init <- tau
# Addapt tolearance for break condition.
tol <- sqrt(2 * q) * tol
# Estaimate bandwidth if not given.
if (missing(h) | !is.numeric(h)) {
h <- estimate.bandwidth(X, k, nObs)
}
# Compute persistent data.
# Compute lookup indexes for symmetrie, lower/upper
# triangular parts and vectorization.
pair.index <- elem.pairs(seq(n))
i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
# Matrix of vectorized indices. (vec(index) -> seq)
index <- matrix(seq(n * n), n, n)
lower <- index[lower.tri(index)]
upper <- t(index)[lower]
# Create all pairewise differences of rows of `X`.
X_diff <- X[i, , drop = F] - X[j, , drop = F]
# Identity matrix.
I_p <- diag(1, p)
# Init a list of data indices (shuffled for batching).
indices <- seq(n)
# Init tracking of current best (according multiple attempts).
V.best <- NULL
loss.best <- Inf
# Start loop for multiple attempts.
for (attempt in 1:attempts) {
# Reset learning rate `tau`.
tau <- tau.init
# Sample a `(p, q)` dimensional matrix from the stiefel manifold as
# optimization start value.
V <- rStiefl(p, q)
# Keep track of last `V` for computing error after an epoch.
V.last <- V
# Repeat `epochs` times
for (epoch in 1:epochs) {
# Shuffle batches
batch.shuffle <- sample(indices)
# Make a step for each batch.
for (start in seq(1, n, batch.size)) {
# Select batch data indices.
batch <- batch.shuffle[start:(start + batch.size - 1)]
# Remove `NA`'s (when `n` isn't a multiple of `batch.size`).
batch <- batch[!is.na(batch)]
# Compute batch gradient.
loss <- NULL
G <- grad(X[batch, ], Y[batch], V, h, loss.out = TRUE)
# Cayley transform matrix.
A <- (G %*% t(V)) - (V %*% t(G))
# Apply learning rate `tau`.
A.tau <- tau * A
# Parallet transport (on Stiefl manifold) into direction of `G`.
V <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V)
}
# Compute actuall loss after finishing optimization.
loss <- grad(X, Y, V, h, loss.only = TRUE, persistent = TRUE)
# And the error for the history.
error <- norm(V.last %*% t(V.last) - V %*% t(V), type = "F")
V.last <- V
# Finaly write history.
loss.history[epoch, attempt] <- loss
error.history[epoch, attempt] <- error
# Check break condition.
if (error < tol) {
break()
}
}
# After each attempt, check if last attempt reached a better result.
if (loss < loss.best) {
loss.best <- loss
V.best <- V
}
}
return(list(
loss.history = loss.history,
error.history = error.history,
loss = loss.best,
V = V.best,
B = null(V.best),
h = h
))
}