#' Simple implementation of the CVE method. 'Simple' means that this method is
#' a classic GD method unsing no further tricks.
#'
#' @keywords internal
#' @export
cve_simple <- function(X, Y, k,
                       nObs = sqrt(nrow(X)),
                       h = NULL,
                       tau = 1.0,
                       tol = 1e-3,
                       slack = 0,
                       epochs = 50L,
                       attempts = 10L
) {
    # Set `grad` functions environment to enable if to find this environments
    # local variabels, needed to enable the manipulation of this local variables
    # from within `grad`.
    environment(grad) <- environment()

    # Setup loss histroy.
    loss.history <- matrix(NA, epochs, attempts);

    # Get dimensions.
    n <- nrow(X)
    p <- ncol(X)
    q <- p - k

    # Save initial learning rate `tau`.
    tau.init <- tau
    # Addapt tolearance for break condition.
    tol <- sqrt(2 * q) * tol

    # Estaimate bandwidth if not given.
    if (missing(h) | !is.numeric(h)) {
        h <- estimate.bandwidth(X, k, nObs)
    }

    I_p <- diag(1, p)

    # Init tracking of current best (according multiple attempts).
    V.best <- NULL
    loss.best <- Inf

    # Start loop for multiple attempts.
    for (attempt in 1:attempts) {

        # reset step width `tau`
        tau <- tau.init

        # Sample a `(p, q)` dimensional matrix from the stiefel manifold as
        # optimization start value.
        V <- rStiefl(p, q)

        # Initial loss and gradient.
        loss <- Inf
        G <- grad(X, Y, V, h, loss.out = TRUE) # `loss.out=T` sets `loss`!
        # Set last loss (aka, loss after applying the step).
        loss.last <- loss

        # Cayley transform matrix `A`
        A <- (G %*% t(V)) - (V %*% t(G))

        ## Start optimization loop.
        for (epoch in 1:epochs) {
            # Apply learning rate `tau`.
            A.tau <- tau * A
            # Parallet transport (on Stiefl manifold) into direction of `G`.
            V.tau <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V)

            # Loss at position after a step.
            loss <- grad(X, Y, V.tau, h, loss.only = TRUE)

            # Check if step is appropriate
            if ((loss - loss.last) > slack * loss.last) {
                tau <- tau / 2
                next() # Keep position and try with smaller `tau`.
            }

            # Compute error.
            error <- norm(V %*% t(V) - V.tau %*% t(V.tau), type = "F")
            # Check break condition (epoch check to skip ignored gradient calc).
            # Note: the devision by `sqrt(2 * k)` is included in `tol`.
            if (error < tol | epoch >= epochs) {
                # take last step and stop optimization.
                V <- V.tau
                break()
            }

            # Perform the step and remember previous loss.
            V <- V.tau
            loss.last <- loss

            # Compute gradient at new position.
            # Note: `loss` will be updated too!
            G <- grad(X, Y, V, h, loss.out = TRUE, loss.log = TRUE)

            # Cayley transform matrix `A`
            A <- (G %*% t(V)) - (V %*% t(G))
        }

        # Check if current attempt improved previous ones
        if (loss < loss.best) {
            loss.best <- loss
            V.best <- V
        }

    }

    return(list(
        loss.history = loss.history,
        loss = loss.best,
        V = V.best,
        B = null(V.best),
        h = h
    ))
}