#' Implementation of the CVE method using curvilinear linesearch with Armijo-Wolfe
#' conditions.
#'
#' @keywords internal
#' @export
cve_linesearch <- function(X, Y, k,
                           nObs = sqrt(nrow(X)),
                           h = NULL,
                           tau = 1.0,
                           tol = 1e-3,
                           rho1 = 0.1,
                           rho2 = 0.9,
                           slack = 0,
                           epochs = 50L,
                           attempts = 10L,
                           max.linesearch.iter = 10L,
                           logger = NULL
) {
    # Set `grad` functions environment to enable if to find this environments
    # local variabels, needed to enable the manipulation of this local variables
    # from within `grad`.
    environment(grad) <- environment()

    # Get dimensions.
    n <- nrow(X)
    p <- ncol(X)
    q <- p - k

    # Save initial learning rate `tau`.
    tau.init <- tau
    # Addapt tolearance for break condition.
    tol <- sqrt(2 * q) * tol

    # Estaimate bandwidth if not given.
    if (missing(h) | !is.numeric(h)) {
        h <- estimate.bandwidth(X, k, nObs)
    }

    # Compute persistent data.
    # Compute lookup indexes for symmetrie, lower/upper
    # triangular parts and vectorization.
    pair.index <- elem.pairs(seq(n))
    i <- pair.index[1, ] # `i` indices of `(i, j)` pairs
    j <- pair.index[2, ] # `j` indices of `(i, j)` pairs
    # Matrix of vectorized indices. (vec(index) -> seq)
    index <- matrix(seq(n * n), n, n)
    lower <- index[lower.tri(index)]
    upper <- t(index)[lower]

    # Create all pairewise differences of rows of `X`.
    X_diff <- X[i, , drop = F] - X[j, , drop = F]
    # Identity matrix.
    I_p <- diag(1, p)

    # Init tracking of current best (according multiple attempts).
    V.best <- NULL
    loss.best <- Inf

    # Start loop for multiple attempts.
    for (attempt in 1:attempts) {

        # Sample a `(p, q)` dimensional matrix from the stiefel manifold as
        # optimization start value.
        V <- rStiefl(p, q)

        # Initial loss and gradient.
        loss <- Inf
        G <- grad(X, Y, V, h, loss.out = TRUE, persistent = TRUE)
        # Set last loss (aka, loss after applying the step).
        loss.last <- loss

        # Call logger with initial values before starting optimization.
        if (is.function(logger)) {
            epoch <- 0 # Set epoch count to 0 (only relevant for logging).
            error <- NA
            logger(environment())
        }

        ## Start optimization loop.
        for (epoch in 1:epochs) {

            # Cayley transform matrix `A`
            A <- (G %*% t(V)) - (V %*% t(G))

            # Directional derivative of the loss at current position, given
            # as `Tr(G^T \cdot A \cdot V)`.
            loss.prime <- -0.5 * norm(A, type = 'F')^2

            # Linesearch
            tau.upper <- Inf
            tau.lower <- 0
            tau <- tau.init
            for (iter in 1:max.linesearch.iter) {
                # Apply learning rate `tau`.
                A.tau <- (tau / 2) * A
                # Parallet transport (on Stiefl manifold) into direction of `G`.
                inv <- solve(I_p + A.tau)
                V.tau <- inv %*% ((I_p - A.tau) %*% V)

                # Loss at position after a step.
                loss <- Inf # aka loss.tau
                G.tau <- grad(X, Y, V.tau, h, loss.out = TRUE, persistent = TRUE)

                # Armijo condition.
                if (loss > loss.last + (rho1 * tau * loss.prime)) {
                    tau.upper <- tau
                    tau <- (tau.lower + tau.upper) / 2
                    next()
                }

                V.prime.tau <- -0.5 * inv %*% A %*% (V + V.tau)
                loss.prime.tau <- sum(G * V.prime.tau) # Tr(grad(tau)^T \cdot Y^'(tau))

                # Wolfe condition.
                if (loss.prime.tau < rho2 * loss.prime) {
                    tau.lower <- tau
                    if (tau.upper == Inf) {
                        tau <- 2 * tau.lower
                    } else {
                        tau <- (tau.lower + tau.upper) / 2
                    }
                } else {
                    break()
                }
            }

            # Compute error.
            error <- norm(V %*% t(V) - V.tau %*% t(V.tau), type = "F")

            # Check break condition (epoch check to skip ignored gradient calc).
            # Note: the devision by `sqrt(2 * k)` is included in `tol`.
            if (error < tol | epoch >= epochs) {
                # take last step and stop optimization.
                V <- V.tau
                # Final call to the logger before stopping optimization
                if (is.function(logger)) {
                    G <- G.tau
                    logger(environment())
                }
                break()
            }

            # Perform the step and remember previous loss.
            V <- V.tau
            loss.last <- loss
            G <- G.tau

            # Log after taking current step.
            if (is.function(logger)) {
                logger(environment())
            }

        }

        # Check if current attempt improved previous ones
        if (loss < loss.best) {
            loss.best <- loss
            V.best <- V
        }

    }

    return(list(
        loss = loss.best,
        V = V.best,
        B = null(V.best),
        h = h
    ))
}