#include "cve.h"

/* Gaussian-style kernel applied to a squared distance `x`:
 * exp(scale * x^2). The caller passes `scale = -0.5 / h`, so this
 * evaluates exp(-x^2 / (2 h)) for bandwidth `h`. */
static inline double gaussKernel(const double x, const double scale) {
    return exp(scale * x * x);
}

/* Conditional Variance Estimation (CVE) subroutine.
 *
 * Minimizes the CVE loss over projection matrices `V` on the Stiefel
 * manifold via gradient descent with Cayley-transform retraction,
 * restarting `attempts` times from random start values and keeping the
 * best result.
 *
 * Parameters:
 *   n, p, q    - number of samples, predictor dimension, and reduction
 *                dimension (V is p x q, column-major).
 *   X          - n x p data matrix (column-major; see matrixprod call).
 *   Y          - response vector of length n (passed through to cost()).
 *   h          - kernel bandwidth.
 *   method     - loss variant selector (e.g. CVE_METHOD_WEIGHTED).
 *   momentum   - momentum coefficient for the gradient update.
 *   tau_init   - initial learning rate (reset at every attempt).
 *   tol_init   - base tolerance; effective tol is tol_init * sqrt(2 q).
 *   slack      - relative slack allowed for a loss increase per step.
 *   gamma      - learning-rate decay factor applied on rejected steps.
 *   epochs     - maximum gradient steps per attempt.
 *   attempts   - number of random restarts; if < 1, the supplied `V` is
 *                used as the (single) start value instead of sampling.
 *   V          - in: optional start value (only read when attempts < 1);
 *                out: best p x q projection found.
 *   L          - out: per-sample loss vector of length n for the best V.
 *   logger,
 *   loggerEnv  - optional R callback + environment, invoked each epoch.
 *
 * All buffers come from R_alloc, so they are reclaimed by R's garbage
 * collector on return from .Call — no explicit free is needed. */
void cve_sub(const int n, const int p, const int q,
             const double *X, const double *Y, const double h,
             const unsigned int method,
             const double momentum,
             const double tau_init, const double tol_init,
             const double slack, const double gamma,
             const int epochs, const int attempts,
             double *V, double *L,
             SEXP logger, SEXP loggerEnv) {
    int attempt = 0, epoch, i, nn = (n * (n - 1)) / 2; /* nr. of row pairs */
    double loss, loss_last, loss_best, err, tau;
    /* Convergence tolerance scaled by the manifold dimension factor. */
    double tol = tol_init * sqrt((double)(2 * q));
    /* Kernel scale: gaussKernel(d, gKscale) = exp(-d^2 / (2 h)). */
    double gKscale = -0.5 / h;
    /* Gradient step scale; negative because we descend. */
    double agility = -2.0 * (1.0 - momentum) / (h * h);
    double c;

    /* Create further intermediate or internal variables
     * (R_alloc'd scratch, sized for the worst case). */
    double *Q = (double*)R_alloc(p * p, sizeof(double));        /* null-space projector */
    double *V_best = (double*)R_alloc(p * q, sizeof(double));   /* best V over attempts */
    double *L_best = (double*)R_alloc(n, sizeof(double));       /* loss vector for V_best */
    double *V_tau = (double*)R_alloc(p * q, sizeof(double));    /* candidate step V(tau) */
    double *X_diff = (double*)R_alloc(nn * p, sizeof(double));  /* pairwise row diffs of X */
    double *X_proj = (double*)R_alloc(nn * p, sizeof(double));  /* projected X / sweep scratch */
    double *y1 = (double*)R_alloc(n, sizeof(double));           /* first moment, filled by cost() */
    double *vecD = (double*)R_alloc(nn, sizeof(double));        /* packed pairwise distances */
    double *vecK = (double*)R_alloc(nn, sizeof(double));        /* kernel-applied distances */
    double *vecS = (double*)R_alloc(nn, sizeof(double));        /* gradient scaling vector */
    double *colSums = (double*)R_alloc(n, sizeof(double));      /* column sums of kernel matrix */
    double *G = (double*)R_alloc(p * q, sizeof(double));        /* Euclidean gradient */
    double *A = (double*)R_alloc(p * p, sizeof(double));        /* skew-symmetric Cayley matrix */

    /* When attempts < 1 the caller-supplied `V` is the start value; keep
     * a private copy so each (single) pass can restore it. */
    double *V_init = (void*)0;
    if (attempts < 1) {
        V_init = (double*)R_alloc(p * q, sizeof(double));
        memcpy(V_init, V, p * q * sizeof(double));
    }

    /* Determine size of working memory used by subroutines. */
    const int workLen = getWorkLen(n, p, q);
    double *workMem = (double*)R_alloc(workLen, sizeof(double));

    /* Compute X_diff, this is static for the entire algorithm. */
    rowDiffs(X, n, p, X_diff);

    /* NOTE(review): if epochs == 0 the inner loop below never runs and
     * `loss` is read uninitialized at the attempt comparison — confirm
     * the caller guarantees epochs >= 1. */
    do {
        /* (Re)set learning rate. */
        tau = tau_init;

        /* Check if start value for `V` was supplied. */
        if (V_init == (void*)0) {
            /* Sample start value from the Stiefel manifold. */
            rStiefel(p, q, V, workMem, workLen);
        } else {
            /* (Re)set start value of `V` to `V_init`. */
            memcpy(V, V_init, p * q * sizeof(double));
        }

        /* Create projection matrix `Q <- I - V V^T` for initial `V`. */
        nullProj(V, p, q, Q);
        /* Compute distance vector. */
        matrixprod(X, n, p, Q, p, p, X_proj); // here X_proj is only `(n, p)`.
        rowDiffSquareSums(X_proj, n, p, vecD);
        /* Apply kernel to distances. */
        for (i = 0; i < nn; ++i) {
            vecK[i] = gaussKernel(vecD[i], gKscale);
        }
        /* Compute col(row) sums of kernel vector (sym. packed lower tri
         * matrix); because `K == K^T` the rowSums equal the colSums. */
        rowSumsSymVec(vecK, n, 1.0, colSums);
        /* Compute loss given the kernel vector and its column sums.
         * Additionally the first moment `y1` is computed and stored
         * (only an intermediate result, needed for `vecS`). */
        loss_last = cost(method, n, Y, vecK, colSums, y1, L);
        /* NOTE(review): assumes an absent logger is passed as C NULL
         * (not R_NilValue) — confirm against the .Call wrapper. */
        if (logger) {
            callLogger(logger, loggerEnv, attempt, 0, L, n, V, p, q, tau);
        }
        /* Calc the scaling vector used for final computation of grad. */
        scaling(method, n, Y, y1, L, vecD, vecK, colSums, vecS);
        /* Compute the Euclidean gradient `G`
         * (X_proj is reused as an (nn, p) scratch buffer here). */
        rowSweep(X_diff, nn, p, "*", vecS, X_proj);
        crossprod(X_diff, nn, p, X_proj, nn, p, workMem);
        matrixprod(workMem, p, p, V, p, q, G);
        if (method == CVE_METHOD_WEIGHTED) {
            /* Compute sum of all kernel-applied distances by summing the
             * colSums of the kernel matrix. */
            c = -(double)n; // to scale with sum(K) - n
            for (i = 0; i < n; ++i) {
                c += colSums[i];
            }
            // TODO: check for division by zero, but should not happen!!!
        } else {
            c = n; // TODO: move (init) up cause always the same ^^ ...
        }
        scale(agility / c, G, p * q); // in-place
        /* Compute skew-symmetric matrix `A` used in the Cayley transform.
         * `A <- tau * (G V^T - V G^T) + 0 * A` */
        skew(p, q, tau, G, V, 0.0, A);

        for (epoch = 0; epoch < epochs; ++epoch) {
            /* Move V along A. */
            cayleyTransform(p, q, A, V, V_tau, workMem);

            /* Create projection matrix for `V_tau`. */
            nullProj(V_tau, p, q, Q);
            /* Compute distance vector. */
            matrixprod(X, n, p, Q, p, p, X_proj); // here X_proj only `(n, p)`.
            rowDiffSquareSums(X_proj, n, p, vecD);
            /* Apply kernel to distances. */
            for (i = 0; i < nn; ++i) {
                vecK[i] = gaussKernel(vecD[i], gKscale);
            }
            /* Compute col(row) sums of kernel vector (sym. packed lower tri
             * matrix); because `K == K^T` the rowSums equal the colSums. */
            rowSumsSymVec(vecK, n, 1.0, colSums);
            /* Compute loss given the kernel vector and its column sums.
             * Additionally the first moment `y1` is computed and stored
             * (only an intermediate result, needed for `vecS`). */
            loss = cost(method, n, Y, vecK, colSums, y1, L);

            /* Check if step is appropriate; if not, reduce learning rate
             * and retry the step from the same V (A shrinks by gamma,
             * which is equivalent to shrinking tau). */
            if ((loss - loss_last) > loss_last * slack) {
                tau *= gamma;
                scale(gamma, A, p * p);
                continue;
            }

            // Compute error; use workMem (keep first `n`, they store `y1`).
            skew(p, q, 1.0, V, V_tau, 0.0, workMem);
            err = norm(workMem, p, p, "F");

            // Shift next step to current step and store loss to last.
            memcpy(V, V_tau, p * q * sizeof(double));
            loss_last = loss;

            if (logger) {
                callLogger(logger, loggerEnv,
                           attempt, epoch + 1, L, n, V, p, q, tau);
            }

            // Check break condition (converged or out of epochs).
            if (err < tol || epoch + 1 >= epochs) {
                break;
            }

            /* Continue computing the gradient. */
            /* Calc the scaling vector used for final computation of grad. */
            scaling(method, n, Y, y1, L, vecD, vecK, colSums, vecS);

            /* Compute the Euclidean gradient `G`. */
            rowSweep(X_diff, nn, p, "*", vecS, X_proj);
            crossprod(X_diff, nn, p, X_proj, nn, p, workMem);

            /* Momentum update: G <- momentum * G + agility * workMem V,
             * realized as a single dgemm with alpha = agility / c and
             * beta = momentum. */
            if (method == CVE_METHOD_WEIGHTED) {
                /* Compute sum of all kernel-applied distances by summing
                 * the colSums of the kernel matrix. */
                c = -(double)n; // to scale with sum(K) - n
                for (i = 0; i < n; ++i) {
                    c += colSums[i];
                }
                c = agility / c;
                // TODO: check for division by zero, but should not happen!!!
            } else {
                c = agility / n;
            }
            F77_NAME(dgemm)("N", "N", &p, &q, &p,
                            &c, workMem, &p, V, &p,
                            &momentum, G, &p);

            /* Compute skew-symmetric matrix `A` used in the Cayley
             * transform. `A <- tau * (G V^T - V G^T) + 0 * A` */
            skew(p, q, tau, G, V, 0.0, A);
        }

        /* Check if the current attempt improved on previous ones. */
        if (attempt == 0 || loss < loss_best) {
            loss_best = loss;
            memcpy(V_best, V, p * q * sizeof(double));
            memcpy(L_best, L, n * sizeof(double));
        }

    } while (++attempt < attempts);

    /* Report the overall best projection and its loss vector. */
    memcpy(V, V_best, p * q * sizeof(double));
    memcpy(L, L_best, n * sizeof(double));
}