//
// Usage (bash):
// ~$ R -e "library(Rcpp); sourceCpp('runtime_tests_grad.cpp')"
//
// Usage (R):
// > library(Rcpp)
// > sourceCpp('runtime_tests_grad.cpp')
//

// // #define ARMA_DONT_USE_WRAPPER
// #define ARMA_DONT_USE_BLAS

// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>
#include <math.h>

// [[Rcpp::export]]
arma::mat arma_grad(const arma::mat& X,
                    const arma::mat& X_diff,
                    const arma::vec& Y,
                    const arma::vec& Y_rep,
                    const arma::mat& V,
                    const double h) {
    using namespace arma;

    uword n = X.n_rows;
    uword p = X.n_cols;

    // orthogonal projection matrix `Q = I - VV'` for dist computation
    mat Q = -(V * V.t());
    Q.diag() += 1;
    // calc pairwise distances as `D` with `D(i, j) = d_i(V, X_j)`
    vec D_vec = sum(square(X_diff * Q), 1);
    mat D = reshape(D_vec, n, n);
    // calc weights as `W` with `W(i, j) = w_i(V, X_j)`
    mat W = exp(D / (-2.0 * h));
    W = normalise(W, 1); // colomn-wise, 1-norm
    vec W_vec = vectorise(W);
    // centered weighted `Y` means
    vec y1 = W.t() * Y;
    vec y2 = W.t() * square(Y);
    // loss for each X_i, meaning `L(i) = L_n(V, X_i)`
    vec L = y2 - square(y1);
    // "global" loss
    double loss = mean(L);
    // `G = \nabla_V L_n(V)` a.k.a. gradient of `L` with respect to `V`
    vec scale = (repelem(L, n, 1) - square(Y_rep - repelem(y1, n, 1))) % W_vec % D_vec;
    mat X_diff_scale = X_diff.each_col() % scale;
    mat G = X_diff_scale.t() * X_diff * V;
    G *= -2.0 / (h * h * n);

    return G;
}

// ATTENTION: assumed `X` stores X_i's column wise, `X = cbind(X_1, X_2, ..., X_n)`
// [[Rcpp::export]]
arma::mat grad(const arma::mat& X,
               const arma::vec& Y,
               const arma::mat& V,
               const double h
) {
    using namespace arma;

    // get dimensions
    const uword n = X.n_cols;
    const uword p = X.n_rows;
    const uword q = V.n_cols;

    // compute orthogonal projection
    mat Q = -(V * V.t());
    Q.diag() += 1.0;

    // distance matrix `D(i, j) = d_i(V, X_j)`
    mat D(n, n, fill::zeros);
    // weights matrix `W(i, j) = w_i(V, X_j)`
    mat W(n, n, fill::ones); // exp(0) = 1

    double mvm = 0.0; // Matrix Vector Mult.
    double sos = 0.0; // Sum Of Squares
    // double wcn = 0.0; // Weights Column Norm
    for (uword j = 0; j + 1 < n; ++j) {
        for (uword i = j + 1; i < n; ++i) {
            sos = 0.0;
            for (uword k = 0; k < p; ++k) {
                mvm = 0.0;
                for (uword l = 0; l < p; ++l) {
                    mvm += Q(k, l) * (X(l, i) - X(l, j));
                }
                sos += mvm * mvm;
            }
            D(i, j) = D(j, i) = sos;
            W(i, j) = W(j, i) = std::exp(sos / (-2. * h));
        }
    }

    // column normalization of weights `W`
    double col_sum;
    for (uword j = 0; j < n; ++j) {
        col_sum = 0.0;
        for (uword i = 0; i < n; ++i) {
            col_sum += W(i, j);
        }
        for (uword i = 0; i < n; ++i) {
            W(i, j) /= col_sum;
        }
    }

    // weighted first, second order responce means `y1`, `y2`
    // and component wise Loss `L(i) = L_n(V, X_i)`
    vec y1(n);
    vec y2(n);
    vec L(n);
    double tmp;
    double loss = 0.0;
    for (uword i = 0; i < n; ++i) {
        mvm = 0.0; // Matrix Vector Mult.
        sos = 0.0; // Sum Of Squared (weighted)
        for (uword k = 0; k < n; ++k) {
            mvm += (tmp = W(k, i) * Y(k));
            sos += tmp * Y(k); // W(k, i) * Y(k) * Y(k)
        }
        y1(i) = mvm;
        y2(i) = sos;
        loss += (L(i) = sos - (mvm * mvm)); // L_n(V, X_i) = y2(i) - y1(i)^2
    }
    loss /= n;

    mat S(n, n);
    for (uword k = 0; k < n; ++k) {
        for (uword l = 0; l < n; ++l) {
            tmp = Y(k) - y1(l);
            S(k, l) = (L(l) - (tmp * tmp)) * W(k, l) * D(k, l);
        }
    }

    // gradient
    mat G(p, q);
    double factor = -2. / (n * h * h);
    double gij;
    for (uword j = 0; j < q; ++j) {
        for (uword i = 0; i < p; ++i) {
            gij = 0.0;
            for (uword k = 0; k < n; ++k) {
                for (uword l = 0; l < n; ++l) {
                    mvm = 0.0;
                    for (uword m = 0; m < p; ++m) {
                        mvm += (X(m, l) - X(m, k)) * V(m, j);
                    }
                    // gij += (S(k, l) + S(l, k)) * (X(i, l) - X(i, k));
                    gij += S(k, l) * (X(i, l) - X(i, k)) * mvm;
                }
            }
            G(i, j) = factor * gij;
        }
    }

    return G;
}

// ATTENTION: assumed `X` stores X_i's column wise, `X = cbind(X_1, X_2, ..., X_n)`
// [[Rcpp::export]]
arma::mat grad_p(const arma::mat& X_ref,
                 const arma::vec& Y_ref,
                 const arma::mat& V_ref,
                 const double h
) {
    using arma::uword;

    // get dimensions
    const uword n = X_ref.n_cols;
    const uword p = X_ref.n_rows;
    const uword q = V_ref.n_cols;

    const double* X = X_ref.memptr();
    const double* Y = Y_ref.memptr();
    const double* V = V_ref.memptr();

    // allocate memory for entire algorithm
    //              Q (p,p)   D+W+S (n,n)   y1+L (n)  G (p,q)
    uword memsize = (p * p) + (3 * n * n) + (2 * n) + (p * q);
    double* mem = static_cast<double*>(malloc(sizeof(double) * memsize));

    // assign pointer to memory associated memory area
    double* Q = mem;
    double* D = Q + (p * p);
    double* W = D + (n * n);
    double* S = W + (n * n);
    double* y1 = S + (n * n);
    double* L = y1 + n;
    double* G = L + n;

    // compute orthogonal projection
    double sum;
    // compute orthogonal projection `Q = I_p - V V'`
    for (uword j = 0; j < p; ++j) {
        for (uword i = j; i < p; ++i) {
            sum = 0.0;
            for (uword k = 0; k < q; ++k) {
                sum += V[k * p + i] * V[k * p + j];
            }
            if (i == j) {
                Q[j * p + i] = 1.0 - sum;
            } else {
                Q[j * p + i] = Q[i * p + j] = -sum;
            }
        }
    }

    // set `diag(D) = 0` and `diag(W) = 1`.
    for (uword i = 0; i < n * n; i += n + 1) {
        D[i] = 0.0;
        W[i] = 1.0;
    }
    // components (using symmetrie) of `D` and `W` (except `diag`)
    double mvm = 0.0; // Matrix Vector Mult.
    for (uword j = 0; j + 1 < n; ++j) {
        for (uword i = j + 1; i < n; ++i) {
            sum = 0.0;
            for (uword k = 0; k < p; ++k) {
                mvm = 0.0;
                for (uword l = 0; l < p; ++l) {
                    mvm += Q[l * p + k] * (X[i * p + l] - X[j * p + l]);
                }
                sum += mvm * mvm;
            }
            D[j * n + i] = D[i * n + j] = sum;
            W[j * n + i] = W[i * n + j] = std::exp(sum / (-2. * h));
        }
    }

    // column normalization of weights `W`
    for (uword j = 0; j < n; ++j) {
        sum = 0.0;
        for (uword i = 0; i < n; ++i) {
            sum += W[j * n + i];
        }
        for (uword i = 0; i < n; ++i) {
            W[j * n + i] /= sum;
        }
    }
    // weighted first, secend order responce means `y1`, `y2`
    // and component wise Loss `L(i) = L_n(V, X_i)`
    double tmp;
    double loss = 0.0;
    for (uword i = 0; i < n; ++i) {
        mvm = 0.0; // Matrix Vector Mult.
        sum = 0.0; // Sum Of (weighted) Squares
        for (uword k = 0; k < n; ++k) {
            mvm += (tmp = W[i * n + k] * Y[k]);
            sum += tmp * Y[k];
        }
        y1[i] = mvm;
        loss += (L[i] = sum - (mvm * mvm));
    }
    loss /= n;

    // scaling for gradient summation
    // this scaling matrix is the lower triangular matrix defined as
    //
    // S_kl := (s_{kl} + s_{lk}) D_{kl}
    // s_ij := (L_n(V, X_j) - (Y_i - y1(V, X_j))^2) W_ij
    double factor;
    for (uword l = 0; l < n; ++l) {
        for (uword k = l + 1; k < n; ++k) {
            tmp = Y[k] - y1[l];
            factor = (L[l] - (tmp * tmp)) * W[l * n + k]; // \tile{S}_{kl}
            tmp = Y[l] - y1[k];
            factor += (L[k] - (tmp * tmp)) * W[k * n + l]; // \tile{S}_{lk}
            S[l * n + k] = factor * D[l * n + k]; // (s_kl + s_lk) * D_kl
        }
    }

    // gradient
    // reuse memory area of `Q`
    // no longer needed and provides enough space (`q < p`)
    double* GD = Q;
    const double* X_l;
    const double* X_k;
    for (uword j = 0; j < p; ++j) {
        for (uword i = 0; i < p; ++i) {
            sum = 0.0;
            for (uword l = 0; l < n; ++l) {
                X_l = X + (l * p);
                for (uword k = l + 1; k < n; ++k) {
                    X_k = X + (k * p);
                    sum += S[l * n + k] * (X_l[i] - X_k[i]) * (X_l[j] - X_k[j]);
                }
            }
            GD[j * p + i] = sum;
        }
    }

    // distance gradient `DG` to gradient by multiplying with `V`
    factor = -2. / (n * h * h);
    for (uword i = 0; i < p; ++i) {
        for (uword j = 0; j < q; ++j) {
            sum = 0.0;
            for (uword k = 0; k < p; ++k) {
                sum += GD[k * p + i] * V[j * p + k];
            }
            G[j * p + i] = factor * sum;
        }
    }

    // construct 'Armadillo' matrix from `G`s memory area
    arma::mat Grad(G, p, q);

    // free entire allocated memory block
    free(mem);

    return Grad;
}


/*** R

suppressMessages(library(microbenchmark))

cat("Start timing:\n")
time.start <- Sys.time()

rStiefl <- function(p, q) {
    return(qr.Q(qr(matrix(rnorm(p * q, 0, 1), p, q))))
}

## compare runtimes
n <- 200L
p <- 12L
q <- p - 2L
X <- matrix(rnorm(n * p), n, p)
Xt <- t(X)
X_diff <- kronecker(rep(1, n), X) - kronecker(X, rep(1, n))
Y <- rnorm(n)
Y_rep <- kronecker(rep(1, n), Y) # repmat(Y, n, 1)
h <- 1. / 4.;
V <- rStiefl(p, q)

# A <- arma_grad(X, X_diff, Y, Y_rep, V, h)
# G1 <- grad(Xt, Y, V, h)
# G2 <- grad_p(Xt, Y, V, h)
# 
# print(round(A[1:6, 1:6], 3))
# print(round(G1[1:6, 1:6], 3))
# print(round(G2[1:6, 1:6], 3))
# print(round(abs(A - G1), 9))
# print(round(abs(A - G2), 9))
# 
# q()


comp <- function (A, B, tol = sqrt(.Machine$double.eps)) {
    max(abs(A - B)) < tol
}
comp.all <- function (res) {
    if (length(res) < 2) {
        return(TRUE)
    }
    res.one = res[[1]]
    for (i in 2:length(res)) {
        if (!comp(res.one, res[[i]])) {
            return(FALSE)
        }
    }
    return(TRUE)
}
counter <- 0
setup.tests <- function () {
    if ((counter %% 3) == 0) {
        X <<- matrix(rnorm(n * p), n, p)
        Xt <<- t(X)
        X_diff <<- kronecker(rep(1, n), X) - kronecker(X, rep(1, n))
        Y <<- rnorm(n)
        Y_rep <<- kronecker(rep(1, n), Y) # arma::repmat(Y, n, 1)
        h <<- 1. / 4.;
        V <<- rStiefl(p, q)
    }
    counter <<- counter + 1
}
mbm <- microbenchmark(
    arma = arma_grad(X, X_diff, Y, Y_rep, V, h),
    # grad = grad(Xt, Y, V, h),
    grad_p = grad_p(Xt, Y, V, h),
    check = comp.all,
    setup = setup.tests(),
    times = 100L
)
cat("\033[1m\033[92mTotal time:", format(Sys.time() - time.start), '\n')
print(mbm)
cat("\033[0m")

boxplot(mbm, las = 2, xlab = NULL)

*/