#' Simple implementation of the CVE method. 'Simple' means that this method is
#' a classic GD method unsing no further tricks.
#'
#' @keywords internal
#' @export
cve_sgd <- function(X, Y, k,
                    nObs = sqrt(nrow(X)),
                    h = NULL,
                    tau = 0.01,
                    epochs = 50L,
                    batch.size = 16L,
                    attempts = 10L
) {
    # Get dimensions.
    n <- nrow(X) # Number of samples.
    p <- ncol(X) # Data dimensions
    q <- p - k   # Complement dimension of the SDR space.

    # Save initial learning rate `tau`.
    tau.init <- tau

    # Estaimate bandwidth if not given.
    if (missing(h) | !is.numeric(h)) {
        h <- estimate.bandwidth(X, k, nObs)
    }

    # Init a list of data indices (shuffled for batching).
    indices <- seq(n)
    I_p <- diag(1, p)

    # Init tracking of current best (according multiple attempts).
    V.best <- NULL
    loss.best <- Inf

    # Start loop for multiple attempts.
    for (attempt in 1:attempts) {
        # Reset learning rate `tau`.
        tau <- tau.init

        # Sample a `(p, q)` dimensional matrix from the stiefel manifold as
        # optimization start value.
        V <- rStiefl(p, q)

        # Repeat `epochs` times
        for (epoch in 1:epochs) {
            # Shuffle batches
            batch.shuffle <- sample(indices)

            # Make a step for each batch.
            for (start in seq(1, n, batch.size)) {
                # Select batch data indices.
                batch <- batch.shuffle[start:(start + batch.size - 1)]
                # Remove `NA`'s (when `n` isn't a multiple of `batch.size`).
                batch <- batch[!is.na(batch)]

                # Compute batch gradient.
                loss <- NULL
                G <- grad(X[batch, ], Y[batch], V, h)

                # Cayley transform matrix.
                A <- (G %*% t(V)) - (V %*% t(G))

                # Apply learning rate `tau`.
                A.tau <- tau * A
                # Parallet transport (on Stiefl manifold) into direction of `G`.
                V <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V)
            }
        }
        # Compute actuall loss after finishing optimization.
        loss <- grad(X, Y, V, h, loss.only = TRUE)
        # After each attempt, check if last attempt reached a better result.
        if (!is.null(V.best)) { # Only required if there is already a result.
            if (loss < loss.best) {
                loss.best <- loss
                V.best <- V
            }
        } else {
            loss.best <- loss
            V.best <- V
        }
    }

    return(list(
        X = X, Y = Y, k = k,
        nObs = nObs, h = h, tau = tau,
        epochs = epochs, batch = batch, attempts = attempts,
        loss = loss.best,
        V = V.best,
        B = null(V.best)
    ))
}