#' Simple implementation of the CVE method. 'Simple' means that this is a
#' classic gradient descent (GD) method using no further tricks.
#'
#' @keywords internal
#' @export
cve_simple <- function(X, Y, k,
                       nObs = sqrt(nrow(X)),
                       h = NULL,
                       tau = 1.0,
                       tol = 1e-3,
                       slack = 0,
                       epochs = 50L,
                       attempts = 10L
) {
    # Adapt tolerance for the break condition.
    tol <- sqrt(2 * k) * tol
    tau.init <- tau # remember initial value to reset for each new attempt

    # Get dimensions.
    n <- nrow(X)
    p <- ncol(X)
    q <- p - k

    # Estimate bandwidth if not given.
    if (missing(h) || !is.numeric(h)) {
        h <- estimate.bandwidth(X, k, nObs)
    }

    # Compute all static data.
    X_diff <- row.pair.apply(X, `-`)
    index <- matrix(seq(n * n), n, n)
    tri.i <- row.pair.apply(index[, 1, drop = FALSE], function(i, j) { i })
    tri.j <- row.pair.apply(index[, 1, drop = FALSE], function(i, j) { j })
    lower.tri.ind <- index[lower.tri(index)]
    upper.tri.ind <- t(index)[lower.tri.ind] # ATTENTION: correct order
    I_p <- diag(1, p)

    # Init variables tracking the best attempt.
    loss.best <- Inf
    V.best <- NULL

    # Run a few attempts with different starting values.
    for (attempt in 1:attempts) {
        # Reset step width `tau`.
        tau <- tau.init

        # Sample a `(p, q)` dimensional matrix from the Stiefel manifold as
        # optimization start value.
        V <- rStiefl(p, q)

        ## Initial loss calculation.
        # Projection onto the orthogonal complement of `span(V)`.
        Q <- I_p - (V %*% t(V))
        # Compute vectorized distance matrix `D`.
        vecD <- rowSums((X_diff %*% Q)^2)
        # Compute weights matrix `W`.
        W <- matrix(1, n, n) # init (`exp(0) = 1` on the diagonal)
        W[lower.tri.ind] <- exp(vecD / (-2 * h)) # set lower triangular part
        W[upper.tri.ind] <- t(W)[upper.tri.ind] # mirror to upper triangular part
        W <- sweep(W, 2, colSums(W), FUN = `/`) # normalize columns
        # Weighted moments of `Y`.
        y1 <- Y %*% W # `Y` is 1D anyway, avoid transposing `W`
        y2 <- Y^2 %*% W
        # Get per sample loss `L(V, X_i)`.
        L <- y2 - y1^2
        # Average to the total loss `L(V)`.
        loss <- mean(L)

        ## Start optimization loop.
        for (iter in 1:epochs) {
            # `tri.i`/`tri.j` index the pairs of a lower triangular matrix
            # stored in column-major order by only their `i` or `j` component.
            # vecW <- lower.tri.vector(W) + upper.tri.vector(W)
            vecW <- W[lower.tri.ind] + W[upper.tri.ind]
            S <- (L[tri.j] - (Y[tri.i] - y1[tri.j])^2) * vecW * vecD

            # Gradient.
            G <- t(X_diff) %*% sweep(X_diff %*% V, 1, S, `*`)
            G <- (-2 / (n * h^2)) * G
            # Cayley transform matrix `A`.
            A <- (G %*% t(V)) - (V %*% t(G))

            # Compute the next `V` for step size `tau` using the Cayley
            # transform, a retraction along the gradient direction that stays
            # on the Stiefel manifold.
            A.tau <- tau * A
            V.tau <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V)

            # Projection onto the orthogonal complement of `span(V.tau)`.
            Q <- I_p - (V.tau %*% t(V.tau))
            # Compute vectorized distance matrix `D`.
            vecD <- rowSums((X_diff %*% Q)^2)
            # Compute weights matrix `W` (only update values, diag keeps 1's).
            W[lower.tri.ind] <- exp(vecD / (-2 * h)) # set lower triangular part
            W[upper.tri.ind] <- t(W)[upper.tri.ind] # mirror to upper triangular part
            W <- sweep(W, 2, colSums(W), FUN = `/`) # normalize columns
            # Weighted moments of `Y`.
            y1 <- Y %*% W # `Y` is 1D anyway, avoid transposing `W`
            y2 <- Y^2 %*% W
            # Get per sample loss `L(V, X_i)`.
            L <- y2 - y1^2
            # Average to the total loss `L(V)`.
            loss.tau <- mean(L)

            # Check if the step is appropriate: if the loss increased by more
            # than `slack` allows, halve the step size, otherwise accept.
            if (loss != Inf && loss.tau - loss > slack * loss) {
                tau <- tau / 2
            } else {
                loss <- loss.tau
                V <- V.tau
            }
        }

        # Check if the current attempt improved on previous ones. Compare the
        # accepted `loss`/`V`, since the last `V.tau` may have been rejected.
        if (loss < loss.best) {
            loss.best <- loss
            V.best <- V
        }
    }

    return(list(
        loss = loss.best,
        V = V.best,
        B = null(V.best),
        h = h
    ))
}
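
# Usage sketch (illustrative only, not part of the package): the data
# generating model below is an assumption for demonstration. It presumes the
# package's internal helpers (`estimate.bandwidth`, `row.pair.apply`,
# `rStiefl`, `null`) are available, e.g. via `devtools::load_all()`, and that
# `Y` depends on `X` only through the single direction `b`.
#
#   set.seed(21)
#   n <- 100; p <- 6; k <- 1
#   b <- c(1, 1, 1, 0, 0, 0) / sqrt(3)             # assumed true direction
#   X <- matrix(rnorm(n * p), n, p)
#   Y <- drop((X %*% b)^2 + rnorm(n, sd = 0.25))   # keep `Y` as a plain vector
#   fit <- cve_simple(X, Y, k)
#   fit$B    # estimated basis of the k-dimensional reduction, compare to `b`
#   fit$loss # loss of the best attempt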