CVE/cve_V2.cpp

//
// Development file.
//
// Usage:
//  ~$ R -e "library(Rcpp); Rcpp::sourceCpp('cve_V2.cpp')"
//

// only `RcppArmadillo.h` which includes `Rcpp.h`
#include <RcppArmadillo.h>

// through the depends attribute `Rcpp` is tolled to create
// hooks for `RcppArmadillo` needed by the build process.
//
// [[Rcpp::depends(RcppArmadillo)]]

// required for `R::qchisq()` used in `estimateBandwidth()`
#include <Rmath.h>

//' Estimated bandwidth for CVE.
//'
//' Estimates a propper bandwidth \code{h} according
//' \deqn{h = \chi_{p-q}^{-1}\left(\frac{nObs - 1}{n-1}\right)\frac{2 tr(\Sigma)}{p}}{%
//'       h = qchisq( (nObs - 1)/(n - 1), p - q ) 2 tr(Sigma) / p}
//'
//' @param X data matrix of dimension (n x p) with n data points X_i of dimension
//'  q. Therefor each row represents a datapoint of dimension p.
//' @param k Guess for rank(B).
//' @param nObs Ether numeric of a function. If specified as numeric value
//'  its used in the computation of the bandwidth directly. If its a function
//'  `nObs` is evaluated as \code{nObs(nrow(x))}. The default behaviou if not
//'  supplied at all is to use \code{nObs <- nrow(x)^0.5}.
//'
//' @seealso [qchisq()]
//'
//' @export
// [[Rcpp::export]]
double estimateBandwidth(const arma::mat& X, arma::uword k, double nObs) {
    using namespace arma;

    uword n = X.n_rows; // nr samples
    uword p = X.n_cols; // dimension of rand. var. `X`

    // column mean
    mat M = mean(X);
    // center `X`
    mat C = X.each_row() - M;
    // trace of covariance matrix, `traceSigma = Tr(C' C)`
    double traceSigma = accu(C % C);
    // compute estimated bandwidth
    double qchi2 = R::qchisq((nObs - 1.0) / (n - 1), static_cast<double>(k), 1, 0);

    return 2.0 * qchi2 * traceSigma / (p * n);
}

//' Random element from Stiefel Manifold `S(p, q)`.
//'
//' Draws an element of \eqn{S(p, q)} which is the Stiefel Manifold.
//' This is done by taking the Q-component of the QR decomposition
//' from a `(p, q)` Matrix with independent standart normal entries.
//' As a semi-orthogonal Matrix the result `V` satisfies \eqn{V'V = I_q}.
//'
//' @param p Row dimension
//' @param q Column dimension
//'
//' @return Matrix of dim `(p, q)`.
//'
//' @seealso <https://en.wikipedia.org/wiki/Stiefel_manifold>
//'
//' @export
// [[Rcpp::export]]
arma::mat rStiefel(arma::uword p, arma::uword q) {
    arma::mat Q, R;
    arma::qr_econ(Q, R, arma::randn<arma::mat>(p, q));
    return Q;
}

double gradient(const arma::mat& X,
                const arma::mat& X_diff,
                const arma::mat& Y,
                const arma::mat& Y_rep,
                const arma::mat& V,
                const double h,
                arma::mat* G = 0
) {
    using namespace arma;

    uword n = X.n_rows;
    uword p = X.n_cols;

    // orthogonal projection matrix `Q = I - VV'` for dist computation
    mat Q = -(V * V.t());
    Q.diag() += 1;
    // calc pairwise distances as `D` with `D(i, j) = d_i(V, X_j)`
    vec D_vec = sum(square(X_diff * Q), 1);
    mat D = reshape(D_vec, n, n);
    // calc weights as `W` with `W(i, j) = w_i(V, X_j)`
    mat W = exp(D / (-2.0 * h));
    // column-wise normalization via 1-norm
    W = normalise(W, 1);
    vec W_vec = vectorise(W);
    // weighted `Y` means (first and second order)
    vec y1 = W.t() * Y;
    vec y2 = W.t() * square(Y);
    // loss for each `X_i`, meaning `L(i) = L_n(V, X_i)`
    vec L = y2 - square(y1);
    // `loss = L_n(V)`
    double loss = mean(L);
    // check if gradient as output variable is set
    if (G != 0) {
        // `G = grad(L_n(V))` a.k.a. gradient of `L` with respect to `V`
        vec scale = (repelem(L, n, 1) - square(Y_rep - repelem(y1, n, 1))) % W_vec % D_vec;
        mat X_diff_scale = X_diff.each_col() % scale;
        (*G) = X_diff_scale.t() * X_diff * V;
        (*G) *= -2.0 / (h * h * n);
    }

    return loss;
}

//' Stiefel Optimization with curvilinear linesearch.
//'
//' TODO: finish doc. comment
//' Stiefel Optimization for \code{V} given a dataset \code{X} and responces
//' \code{Y} for the model \deqn{Y\sim g(B'X) + \epsilon}{Y ~ g(B'X) + epsilon}
//' to compute the matrix `B` such that \eqn{span{B} = span(V)^{\bot}}{%
//' span(B) = orth(span(B))}.
//' The curvilinear linesearch uses Armijo-Wolfe conditions:
//      \deqn{L(V(tau)) > L(V(0)) + rho_1 * tau * L(V(0))'}
//'     \deqn{L(V(tau))' < rho_2 * L(V(0))'}
//'
//' @param X data points
//' @param Y response
//' @param k assumed \eqn{rank(B)}
//' @param nObs parameter for bandwidth estimation, typical value
//'     \code{nObs = nrow(X)^lambda} with \code{lambda} in the range [0.3, 0.8].
//' @param tau Initial step size
//' @param tol Tolerance for update error used for stopping criterion
//' @param maxIter Upper bound of optimization iterations
//'
//' @return List containing the bandwidth \code{h}, optimization objective \code{V}
//'     and the matrix \code{B} estimated for the model as a orthogonal basis of the
//'     orthogonal space spaned by \code{V}.
//'
//' @rdname optStiefel
//' @keywords internal
double optStiefel(
        const arma::mat& X,
        const arma::vec& Y,
        const int k,
        const double h,
        const double tauInitial,
        const double tol,
        const int maxIter,
        const double rho1,
        const double rho2,
        const int maxLineSeachIter,
        arma::mat& V,      // out
        arma::vec& history // out
) {
    using namespace arma;

    // get dimensions
    const uword n = X.n_rows; // nr samples
    const uword p = X.n_cols; // dim of random variable `X`
    const uword q = p - k;    // rank(V) e.g. dim of ortho space of span{B}

    // all `X_i - X_j` differences, `X_diff.row(i * n + j) = X_i - X_j`
    mat X_diff(n * n, p);
    for (uword i = 0, k = 0; i < n; ++i) {
        for (uword j = 0; j < n; ++j) {
            X_diff.row(k++) = X.row(i) - X.row(j);
        }
    }
    const vec Y_rep = repmat(Y, n, 1);
    const mat I_p = eye<mat>(p, p);
    const mat I_2q = eye<mat>(2 * q, 2 * q);

    // initial start value for `V`
    V = rStiefel(p, q);

    // first gradient initialization
    mat G;
    double loss = gradient(X, X_diff, Y, Y_rep, V, h, &G);

    // set first `loss` in history
    history(0) = loss;

    // main curvilinear optimization loop
    double error = datum::inf;
    int iter = 0;
    while (iter++ < maxIter && error > tol) {
        // helper matrices `lU` (linesearch U), `lV` (linesearch V)
        // as describet in [Wen, Yin] Lemma 4.
        mat lU = join_rows(G, V); // linesearch "U"
        mat lV = join_rows(V, -1.0 * G); // linesearch "V"
        // `A = G V' - V G'`
        mat A = lU * lV.t();

        // set initial step size for curvilinear line search
        double tau = tauInitial, lower = 0., upper = datum::inf;

        // check if `tau` is valid for inverting

        // set line search internal gradient and loss to cycle for next iteration
        mat V_tau; // next position after a step of size `tau`, a.k.a. `V(tau)`
        mat G_tau; // gradient of `V` at `V(tau) = V_tau`
        double loss_tau; // loss (objective) at position `V(tau)`
        int lsIter = 0; // linesearch iter
        // start line search
        do {
            mat HV = inv(I_2q + (tau/2.) * lV.t() * lU) * lV.t();
            // next step `V`
            V_tau = V - tau * (lU * (HV * V));

            double LprimeV = -trace(G.t() * A * V);

            mat lB = I_p - (tau / 2.) * lU * HV;

            loss_tau = gradient(X, X_diff, Y, Y_rep, V_tau, h, &G_tau);

            double LprimeV_tau = -2. * trace(G_tau.t() * lB * A * (V + V_tau));

            // Armijo condition
            if (loss_tau > loss + (rho1 * tau * LprimeV)) {
                upper = tau;
                tau = (lower + upper) / 2.;
            // Wolfe condition
            } else if (LprimeV_tau < rho2 * LprimeV) {
                lower = tau;
                if (upper == datum::inf) {
                    tau = 2. * lower;
                } else {
                    tau = (lower + upper) / 2.;
                }
            } else {
                break;
            }
        } while (++lsIter < maxLineSeachIter);

        // compute error (break condition)
        // Note: `error` is the decrease of the objective `L_n(V)` and not the
        //       norm of the gradient as proposed in [Wen, Yin] Algorithm 1.
        error = loss - loss_tau;

        // cycle `V`, `G` and `loss` for next iteration
        V = V_tau;
        loss = loss_tau;
        G = G_tau;

        // store final `loss`
        history(iter) = loss;

    }

    return loss;
}


//' Conditional Variance Estimation (CVE) method.
//'
//' This version uses a curvilinear linesearch for the stiefel optimization.
//'
//' @param X data points
//' @param Y response
//' @param k assumed \eqn{rank(B)}
//' @param nObs parameter for bandwidth estimation, typical value
//'     \code{nObs = nrow(X)^lambda} with \code{lambda} in the range [0.3, 0.8].
//' @param tau Initial step size (default 1)
//' @param tol Tolerance for update error used for stopping criterion (default 1e-5)
//' @param slack Ratio of small negative error allowed in loss optimization (default -1e-10)
//' @param maxIter Upper bound of optimization iterations (default 50)
//' @param attempts Number of tryes with new random optimization starting points (default 10)
//'
//' @return List containing the bandwidth \code{h}, optimization objective \code{V}
//'     and the matrix \code{B} estimated for the model as a orthogonal basis of the
//'     orthogonal space spaned by \code{V}.
//'
//' @rdname cve_cpp_V2
//' @export
// [[Rcpp::export]]
Rcpp::List cve_cpp(
        const arma::mat& X,
        const arma::vec& Y,
        const int k,
        const double nObs,
        const double tauInitial = 1.,
        const double rho1 = 0.05,
        const double rho2 = 0.95,
        const double tol = 1e-6,
        const int maxIter = 50,
        const int maxLineSeachIter = 10,
        const int attempts = 10
) {
    using namespace arma;

    // tracker of current best results
    mat V_best;
    double loss_best = datum::inf;
    // estimate bandwidth
    double h = estimateBandwidth(X, k, nObs);

    // loss history for each optimization attempt
    // each column contaions the iteration history for the loss
    mat history = mat(maxIter + 1, attempts);

    // multiple stiefel optimization attempts
    for (int i = 0; i < attempts; ++i) {
        // declare output variables
        mat V; // estimated `V` space
        vec hist = vec(history.n_rows, fill::zeros); // optimization history
        double loss = optStiefel(X, Y, k, h,
            tauInitial, tol, maxIter, rho1, rho2, maxLineSeachIter, V, hist
        );
        if (loss < loss_best) {
            loss_best = loss;
            V_best = V;
        }
        // write history to history collection
        history.col(i) = hist;
    }

    // get `B` as kernal of `V.t()`
    mat B = null(V_best.t());

    return Rcpp::List::create(
        Rcpp::Named("history") = history,
        Rcpp::Named("loss") = loss_best,
        Rcpp::Named("h") = h,
        Rcpp::Named("V") = V_best,
        Rcpp::Named("B") = B
    );
}

/*** R

source("CVE/R/datasets.R")
ds <- dataset()

print(system.time(
    cve.res <- cve_cpp(
        X = ds$X,
        Y = ds$Y,
        k = ncol(ds$B),
        nObs = sqrt(nrow(ds$X))
    )
))

pdf('plots/cve_V2_history.pdf')
H <- cve.res$history
H_i <- H[H[, 1] > 0, 1]
plot(1:length(H_i), H_i,
    main = "History cve_V2",
    xlab = "Iterations i",
    ylab = expression(loss == L[n](V^{(i)})),
    xlim = c(1, nrow(H)),
    ylim = c(0, max(H)),
    type = "l"
)
for (i in 2:ncol(H)) {
    H_i <- H[H[, i] > 0, i]
    lines(1:length(H_i), H_i)
}
x.ends <- apply(H, 2, function(h) { length(h[h > 0]) })
y.ends <- apply(H, 2, function(h) { min(h[h > 0]) })
points(x.ends, y.ends)

*/