#' Implementation of the CVE method as a Riemannian conjugate gradient method.
#'
#' @references A Riemannian Conjugate Gradient Algorithm with Implicit Vector
#'  Transport for Optimization on the Stiefel Manifold
#' @keywords internal
#' @export
cve_rcg <- function(X, Y, k,
                    nObs = sqrt(nrow(X)),
                    h = NULL,
                    tau = 1.0,
                    tol = 1e-4,
                    rho = 1e-4, # For the Armijo condition.
                    slack = 0,
                    epochs = 50L,
                    attempts = 10L,
                    max.linesearch.iter = 20L,
                    logger = NULL
) {
    # Set the environment of `grad` to this function's environment so that
    # `grad` can find (and manipulate) the local variables defined below.
    environment(grad) <- environment()

    # Get dimensions.
    n <- nrow(X) # Number of samples.
    p <- ncol(X) # Data dimension.
    q <- p - k   # Complement dimension of the SDR space.

    # Save initial learning rate `tau`.
    tau.init <- tau
    # Adapt tolerance for the break condition.
    tol <- sqrt(2 * q) * tol

    # Estimate bandwidth if not given.
    if (missing(h) || !is.numeric(h)) {
        h <- estimate.bandwidth(X, k, nObs)
    }

    # Compute persistent data (consumed by `grad` through the shared
    # environment): lookup indexes for symmetry, lower/upper triangular
    # parts and vectorization.
    pair.index <- elem.pairs(seq(n))
    i <- pair.index[1, ] # `i` indices of `(i, j)` pairs.
    j <- pair.index[2, ] # `j` indices of `(i, j)` pairs.
    # Indexes into the vectorized matrix for the lower and upper triangular part.
    lower <- ((i - 1) * n) + j
    upper <- ((j - 1) * n) + i

    # Create all pairwise differences of rows of `X`.
    X_diff <- X[i, , drop = FALSE] - X[j, , drop = FALSE]

    # Identity matrix.
    I_p <- diag(1, p)

    # Init tracking of the current best result (over multiple attempts).
    V.best <- NULL
    loss.best <- Inf

    # Start loop for multiple attempts.
    for (attempt in 1:attempts) {
        # Reset learning rate `tau`.
        tau <- tau.init

        # Sample a `(p, q)` dimensional matrix from the Stiefel manifold as
        # optimization start value.
        V <- rStiefl(p, q)

        # Initial loss and gradient (`grad` assigns the loss at `V` to the
        # local `loss` through the shared environment).
        loss <- Inf
        G <- grad(X, Y, V, h, loss.out = TRUE, persistent = TRUE)
        # Keep track of the loss at the current position.
        loss.last <- loss

        # Cayley transform matrix `A`.
        A <- (G %*% t(V)) - (V %*% t(G))
        A.last <- A

        W <- -A
        Z <- W %*% V

        # Compute directional derivative.
        loss.prime <- sum(G * Z) # Tr(G^T Z)

        # Call logger with initial values before starting optimization.
        if (is.function(logger)) {
            epoch <- 0 # Set epoch count to 0 (only relevant for logging).
            error <- NA
            logger(environment())
        }

        ## Start optimization loop.
        for (epoch in 1:epochs) {
            # New directional derivative (along the updated direction `Z`).
            loss.prime <- sum(G * Z)

            # Reset `tau` for step-size selection.
            tau <- tau.init
            # Backtracking line search: halve `tau` until the Armijo
            # condition holds (or the iteration budget is exhausted).
            for (iter in 1:max.linesearch.iter) {
                V.tau <- retractStiefl(V + tau * Z)
                # Loss at the position after a step.
                loss <- grad(X, Y, V.tau, h, loss.only = TRUE, persistent = TRUE)

                # Check the Armijo condition.
                if (loss <= loss.last + (rho * tau * loss.prime)) {
                    break # If fulfilled, stop the line search.
                }
                # Reduce step-size and continue the line search.
                tau <- tau / 2
            }

            # Compute error as the distance between the projections onto the
            # old and the new subspace.
            error <- norm(V %*% t(V) - V.tau %*% t(V.tau), type = "F")

            # Perform step with the found step-size.
            V <- V.tau
            loss.last <- loss

            # Call logger.
            if (is.function(logger)) {
                logger(environment())
            }

            # Check the break condition.
            # Note: the normalization by `sqrt(2 * q)` is already folded into `tol`.
            if (error < tol) {
                break
            }

            # Compute gradient at the new position.
            G <- grad(X, Y, V, h, persistent = TRUE)

            # Store last `A` for the `beta` computation.
            A.last <- A
            # Cayley transform matrix `A`.
            A <- (G %*% t(V)) - (V %*% t(G))

            # Check the second break condition (vanishing gradient).
            if (norm(A, type = 'F') < tol) {
                break
            }

            # Directional derivative of the new gradient along the old
            # direction `Z`.
            loss.prime <- sum(G * Z)

            # If `Z` is still a descent direction for the new gradient, use
            # the conjugate update; otherwise reset `beta` (steepest descent).
            if (loss.prime < 0) {
                # Compute `beta` as described in the referenced paper: the
                # Polak-Ribiere value clamped into `[-beta.FR, beta.FR]`.
                beta.FR <- (norm(A, type = 'F') / norm(A.last, type = 'F'))^2
                beta.PR <- sum(A * (A - A.last)) / norm(A.last, type = 'F')^2
                if (beta.PR < -beta.FR) {
                    beta <- -beta.FR
                } else if (abs(beta.PR) < beta.FR) {
                    beta <- beta.PR
                } else if (beta.PR > beta.FR) {
                    beta <- beta.FR
                } else {
                    beta <- 0
                }
            } else {
                beta <- 0
            }

            # Update direction.
            W <- -A + beta * W
            Z <- W %*% V
        }

        # Check if the current attempt improved on the previous ones.
        if (loss < loss.best) {
            loss.best <- loss
            V.best <- V
        }
    }

    return(list(
        loss = loss.best,
        V = V.best,
        B = null(V.best),
        h = h
    ))
}
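
## The Stiefel helpers used above (`rStiefl`, `retractStiefl`) are package
## internals not shown in this file. The sketches below illustrate one common
## QR-based choice; they are assumptions for illustration (hence the `_sketch`
## suffix), not the package's actual definitions.
rStiefl_sketch <- function(p, q) {
    # Orthonormalize a Gaussian `(p, q)` matrix: the Q factor of its QR
    # decomposition has orthonormal columns, i.e. lies on the Stiefel manifold.
    qr.Q(qr(matrix(rnorm(p * q), p, q)))
}
retractStiefl_sketch <- function(A) {
    # QR retraction: map an arbitrary `(p, q)` matrix back onto the Stiefel
    # manifold via the Q factor of its QR decomposition.
    qr.Q(qr(A))
}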
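
## Hedged usage sketch: `cve_rcg` is internal, so this assumes evaluation
## inside the package namespace (e.g. after `devtools::load_all()`), where
## the helpers `grad`, `estimate.bandwidth`, `elem.pairs`, `rStiefl`,
## `retractStiefl` and `null` resolve. The data-generating process below is
## illustrative only.
if (FALSE) {
    set.seed(42)
    n <- 100; p <- 4; k <- 1
    X <- matrix(rnorm(n * p), n, p)
    b <- c(1, 1, 0, 0) / sqrt(2)                # True SDR direction.
    Y <- drop((X %*% b)^2) + rnorm(n, sd = 0.1)
    fit <- cve_rcg(X, Y, k, epochs = 20L, attempts = 3L)
    fit$loss # Loss of the best attempt.
    fit$B    # `(p, k)` basis estimate of the SDR subspace.
}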