#' Euclidean vector norm (2-norm)
#'
#' @param x Numeric vector
#' @return Numeric
norm2 <- function(x) { return(sum(x^2)) }

#' Samples uniform from the Stiefel Manifold
#'
#' @param p row dim.
#' @param q col dim.
#' @return `(p, q)` semi-orthogonal matrix
rStiefl <- function(p, q) {
    return(qr.Q(qr(matrix(rnorm(p * q, 0, 1), p, q))))
}

#' Matrix Trace
#'
#' @param M Square matrix
#' @return Trace \eqn{Tr(M)}
Tr <- function(M) {
    return(sum(diag(M)))
}

#' Null space basis of given matrix `B`
#'
#' @param B `(p, q)` matrix
#' @return Semi-orthogonal `(p, p - q)` matrix `Q` spaning the null space of `B`
null <- function(M) {
    tmp <- qr(M)
    set <- if(tmp$rank == 0L) seq_len(ncol(M)) else -seq_len(tmp$rank)
    return(qr.Q(tmp, complete = TRUE)[, set, drop = FALSE])
}

####
#chooses bandwith h according to formula in paper
#dim...dimension of X vector
#k... row dim of V (dim times q matrix) corresponding to a basis of orthogonal complement of B in model
# N...sample size
#nObs... nObs in bandwith formula
#tr...trace of sample covariance matrix of X
estimateBandwidth<-function(X, k, nObs) {
    n <- nrow(X)
    p <- ncol(X)

    X_centered <- scale(X, center = TRUE, scale = FALSE)
    Sigma <- (1 / n) * t(X_centered) %*% X_centered

    quantil <- qchisq((nObs - 1) / (n - 1), k)
    return(2 * quantil * Tr(Sigma) / p)
}

###########
# evaluates L(V) and returns L_n(V),(L_tilde_n(V,X_i))_{i=1,..,n} and grad_V L_n(V) (p times k) 
# V... (dim times q) matrix 
# Xl... output of Xl_fun
# dtemp...vector with pairwise distances |X_i - X_j|
# q...output of q_ind function
# Y... vector with N Y_i values
# if grad=T, gradient of L(V) also returned
LV <- function(V, Xl, dtemp, h, q, Y, grad = TRUE) {
  N <- length(Y)
  k <- if (is.vector(V)) { 1 } else { ncol(V) }
  Xlv <- Xl %*% V
  d <- dtemp - ((Xlv^2) %*% rep(1, k))
  w <- dnorm(d / h) / dnorm(0)
  w <- matrix(w, N, q)
  w <- apply(w, 2, function(x) { x / sum(x) })
  y1 <- t(w) %*% Y
  y2 <- t(w) %*% (Y^2)
  sig <- y2 - y1^2
  
  result <- list(var = mean(sig), sig = sig)
  if (grad == TRUE) {
    tmp1 <- (kronecker(sig, rep(1, N)) - (as.vector(kronecker(rep(1, q), Y)) - kronecker(y1, rep(1, N)))^2)
    if (k == 1) {
      grad_d <- -2 * Xl * as.vector(Xlv)
      grad <- (1 / h^2) * (1 / q) * t(grad_d * as.vector(d) * as.vector(w)) %*% tmp1
    } else {
      grad <- matrix(0, nrow(V), ncol(V))
      for (j in 1:k) {
        grad_d <- -2 * Xl * as.vector(Xlv[ ,j])
        grad[ ,j] <- (1 / h^2) * (1 / q) * t(grad_d * as.vector(d) * as.vector(w)) %*% tmp1
      }
    }
    result$grad = grad
  }

  return(result)
}
#### performs stiefle optimization of argmin_{V : V'V=I_k} L_n(V)
#through curvilinear search with k0 starting values drawn uniformly on stiefel maniquefold
#dat...(N times dim+1) matrix with first column corresponding to Y values, the other columns 
#consists of X data matrix, (i.e. dat=cbind(Y,X))
#h... bandwidth
#k...row dimension of V that is calculated, corresponds to dimension of orthogonal complement of B
#k0... number of arbitrary starting values
#p...fraction of data points used as shifting point
#maxIter... number of maximal iterations in curvilinear search
#nObs.. nObs parameter for choosing bandwidth if no h is supplied
#lambda_0...initial stepsize
#tol...tolerance for stoping iterations
#sclack_para...if relative improvment is worse than sclack_para the stepsize is reduced
#output:
#est_base...Vhat_k= argmin_V:V'V=I_k L_n(V) a (dim times k) matrix where dim is row-dimension of X data matrix
#var...value of L_n(Vhat_k)
#aov_dat... (L_tilde_n(Vhat_k,X_i))_{i=1,..,N}
#count...number of iterations 
#h...bandwidth
cve_R <- function(
        X, Y, k,
        nObs = sqrt(nrow(X)),
        tauInitial = 1.0,
        tol = 1e-3,
        slack = 0,
        maxIter = 50L,
        attempts = 10L
) {
    # get dimensions
    n <- nrow(X)
    p <- ncol(X)
    q <- p - k

    Xl <- kronecker(rep(1, n), X) - kronecker(X, rep(1, n))
    Xd <- apply(Xl, 1, norm2)
    I_p <- diag(1, p)
    # estimate bandwidth
    h <- estimateBandwidth(X, k, nObs)

    Lbest <- Inf
    Vend <- mat.or.vec(p, q)

    for (. in 1:attempts) {
        Vnew <- Vold <- rStiefl(p, q)
        Lnew <- Lold <- exp(10000)
        tau <- tauInitial

        error <- Inf
        count <- 0
        while (error > tol & count < maxIter) {

            tmp <- LV(Vold, Xl, Xd, h, n, Y)
            G <- tmp$grad
            Lold <- tmp$var

            W <- tau * (G %*% t(Vold) - Vold %*% t(G))
            Vnew <- solve(I_p + W) %*% (I_p - W) %*% Vold
            
            Lnew <- LV(Vnew, Xl, Xd, h, n, Y, grad = FALSE)$var

            if ((Lnew - Lold) > slack * Lold) {
                tau = tau / 2
                error <- Inf
            } else {
                error <- norm(Vold %*% t(Vold) - Vnew %*% t(Vnew), "F") / sqrt(2 * k)
                Vold <- Vnew
            }
            count <- count + 1
        }

        if (Lbest > Lnew) {
            Lbest <- Lnew
            Vend <- Vnew
        }
    }

    return(list(
        loss = Lbest,
        V = Vend,
        B = null(Vend),
        h = h
    ))
}