#' Simple implementation of the CVE method. 'Simple' means that this is a
#' classic stochastic gradient descent method using no further tricks.
#'
#' @keywords internal
#' @export
cve_sgd <- function(X, Y, k,
                    nObs = sqrt(nrow(X)),
                    h = NULL,
                    tau = 0.01,
                    tol = 1e-3,
                    epochs = 50L,
                    batch.size = 16L,
                    attempts = 10L
) {
    # Set the environment of `grad` to this environment so that `grad` can
    # find, and also manipulate, the local variables defined below.
    environment(grad) <- environment()

    # Set up histories.
    loss.history <- matrix(NA, epochs, attempts)
    error.history <- matrix(NA, epochs, attempts)

    # Get dimensions.
    n <- nrow(X) # Number of samples.
    p <- ncol(X) # Number of predictors.
    q <- p - k   # Complement dimension of the SDR space.

    # Save initial learning rate `tau`.
    tau.init <- tau
    # Adapt the tolerance for the break condition.
    tol <- sqrt(2 * q) * tol

    # Estimate bandwidth if not given.
    if (missing(h) || !is.numeric(h)) {
        h <- estimate.bandwidth(X, k, nObs)
    }

    # Compute persistent data.
    # Compute lookup indices for symmetry, lower/upper
    # triangular parts and vectorization.
    pair.index <- elem.pairs(seq(n))
    i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
    j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
    # Matrix of vectorized indices. (vec(index) -> seq)
    index <- matrix(seq(n * n), n, n)
    lower <- index[lower.tri(index)]
    upper <- t(index)[lower]

    # Create all pairwise differences of rows of `X`.
    X_diff <- X[i, , drop = FALSE] - X[j, , drop = FALSE]

    # Identity matrix.
    I_p <- diag(1, p)
    # Data indices, shuffled each epoch for batching.
    indices <- seq(n)

    # Track the current best result (across multiple attempts).
    V.best <- NULL
    loss.best <- Inf

    # Start loop for multiple attempts.
    for (attempt in 1:attempts) {
        # Reset learning rate `tau`.
        tau <- tau.init

        # Sample a `(p, q)` dimensional matrix from the Stiefel manifold as
        # optimization start value.
        V <- rStiefl(p, q)
        # Keep track of the last `V` for computing the error after an epoch.
        V.last <- V

        # Repeat up to `epochs` times.
        for (epoch in 1:epochs) {
            # Shuffle batches.
            batch.shuffle <- sample(indices)

            # Make a step for each batch.
            for (start in seq(1, n, batch.size)) {
                # Select batch data indices.
                batch <- batch.shuffle[start:(start + batch.size - 1)]
                # Remove `NA`s (when `n` isn't a multiple of `batch.size`).
                batch <- batch[!is.na(batch)]

                # Compute the batch gradient (`grad` also writes the batch
                # loss into the local `loss` through the environment set above).
                loss <- NULL
                G <- grad(X[batch, ], Y[batch], V, h, loss.out = TRUE)
                # Cayley transform matrix.
                A <- (G %*% t(V)) - (V %*% t(G))

                # Apply learning rate `tau`.
                A.tau <- tau * A
                # Move along the Stiefel manifold (via the Cayley transform)
                # in the direction given by `G`.
                V <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V)
            }
            # Compute the full-sample loss after finishing the epoch
            # (reusing the persistent data computed above).
            loss <- grad(X, Y, V, h, loss.only = TRUE, persistent = TRUE)
            # And the error for the history.
            error <- norm(V.last %*% t(V.last) - V %*% t(V), type = "F")
            V.last <- V

            # Finally, write the history.
            loss.history[epoch, attempt] <- loss
            error.history[epoch, attempt] <- error
            # Check the break condition.
            if (error < tol) {
                break
            }
        }
        # After each attempt, keep the result if it improves on the best so far.
        if (loss < loss.best) {
            loss.best <- loss
            V.best <- V
        }
    }

    return(list(
        loss.history = loss.history,
        error.history = error.history,
        loss = loss.best,
        V = V.best,
        B = null(V.best),
        h = h
    ))
}
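
# Usage sketch (illustrative only, kept out of execution with `if (FALSE)`):
# a minimal call of `cve_sgd` on synthetic single-index data. The helpers
# `grad`, `estimate.bandwidth`, `elem.pairs`, `rStiefl` and `null` are
# assumed to be available from this package; the data-generating model and
# all parameter choices below are made up for demonstration.
if (FALSE) {
    set.seed(21)
    n <- 100
    p <- 5
    X <- matrix(rnorm(n * p), n, p)
    # True 1-dimensional SDR direction, i.e. k = 1 and q = p - 1.
    beta <- c(1, 1, rep(0, p - 2)) / sqrt(2)
    Y <- as.vector((X %*% beta)^2 + rnorm(n, sd = 0.25))

    fit <- cve_sgd(X, Y, k = 1, epochs = 20L, attempts = 3L)
    fit$B # Estimated basis of the SDR space (null space of the best `V`).
    fit$h # Bandwidth that was estimated (since `h` was not supplied).
}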