#include "cve.h"

/* Gaussian-style kernel applied to a squared distance `x`:
 * exp(scale * x^2). The caller passes `scale = -0.5 / h`, so this
 * evaluates exp(-x^2 / (2 h)) for bandwidth `h`. */
static inline double gaussKernel(const double x, const double scale) {
    return exp(scale * x * x);
}

/* Conditional Variance Estimation (CVE) subroutine.
 *
 * Minimizes the CVE loss over projection matrices `V` on the Stiefel
 * manifold via gradient descent with Cayley-transform retraction,
 * restarting `attempts` times from random start values and keeping the
 * best result.
 *
 * Parameters:
 *   n, p, q    - number of samples, predictor dimension, and reduction
 *                dimension (V is p x q, column-major).
 *   X          - n x p data matrix (column-major; see matrixprod call).
 *   Y          - response vector of length n (passed through to cost()).
 *   h          - kernel bandwidth.
 *   method     - loss variant selector (e.g. CVE_METHOD_WEIGHTED).
 *   momentum   - momentum coefficient for the gradient update.
 *   tau_init   - initial learning rate (reset at every attempt).
 *   tol_init   - base tolerance; effective tol is tol_init * sqrt(2 q).
 *   slack      - relative slack allowed for a loss increase per step.
 *   gamma      - learning-rate decay factor applied on rejected steps.
 *   epochs     - maximum gradient steps per attempt.
 *   attempts   - number of random restarts; if < 1, the supplied `V` is
 *                used as the (single) start value instead of sampling.
 *   V          - in: optional start value (only read when attempts < 1);
 *                out: best p x q projection found.
 *   L          - out: per-sample loss vector of length n for the best V.
 *   logger,
 *   loggerEnv  - optional R callback + environment, invoked each epoch.
 *
 * All buffers come from R_alloc, so they are reclaimed by R's garbage
 * collector on return from .Call — no explicit free is needed. */
void cve_sub(const int n, const int p, const int q,
             const double *X, const double *Y, const double h,
             const unsigned int method,
             const double momentum,
             const double tau_init, const double tol_init,
             const double slack, const double gamma,
             const int epochs, const int attempts,
             double *V, double *L,
             SEXP logger, SEXP loggerEnv) {
    int attempt = 0, epoch, i, nn = (n * (n - 1)) / 2; /* nr. of row pairs */
    double loss, loss_last, loss_best, err, tau;
    /* Convergence tolerance scaled by the manifold dimension factor. */
    double tol = tol_init * sqrt((double)(2 * q));
    /* Kernel scale: gaussKernel(d, gKscale) = exp(-d^2 / (2 h)). */
    double gKscale = -0.5 / h;
    /* Gradient step scale; negative because we descend. */
    double agility = -2.0 * (1.0 - momentum) / (h * h);
    double c;

    /* Create further intermediate or internal variables
     * (R_alloc'd scratch, sized for the worst case). */
    double *Q = (double*)R_alloc(p * p, sizeof(double));        /* null-space projector */
    double *V_best = (double*)R_alloc(p * q, sizeof(double));   /* best V over attempts */
    double *L_best = (double*)R_alloc(n, sizeof(double));       /* loss vector for V_best */
    double *V_tau = (double*)R_alloc(p * q, sizeof(double));    /* candidate step V(tau) */
    double *X_diff = (double*)R_alloc(nn * p, sizeof(double));  /* pairwise row diffs of X */
    double *X_proj = (double*)R_alloc(nn * p, sizeof(double));  /* projected X / sweep scratch */
    double *y1 = (double*)R_alloc(n, sizeof(double));           /* first moment, filled by cost() */
    double *vecD = (double*)R_alloc(nn, sizeof(double));        /* packed pairwise distances */
    double *vecK = (double*)R_alloc(nn, sizeof(double));        /* kernel-applied distances */
    double *vecS = (double*)R_alloc(nn, sizeof(double));        /* gradient scaling vector */
    double *colSums = (double*)R_alloc(n, sizeof(double));      /* column sums of kernel matrix */
    double *G = (double*)R_alloc(p * q, sizeof(double));        /* Euclidean gradient */
    double *A = (double*)R_alloc(p * p, sizeof(double));        /* skew-symmetric Cayley matrix */

    /* When attempts < 1 the caller-supplied `V` is the start value; keep
     * a private copy so each (single) pass can restore it. */
    double *V_init = (void*)0;
    if (attempts < 1) {
        V_init = (double*)R_alloc(p * q, sizeof(double));
        memcpy(V_init, V, p * q * sizeof(double));
    }

    /* Determine size of working memory used by subroutines. */
    const int workLen = getWorkLen(n, p, q);
    double *workMem = (double*)R_alloc(workLen, sizeof(double));

    /* Compute X_diff, this is static for the entire algorithm. */
    rowDiffs(X, n, p, X_diff);

    /* NOTE(review): if epochs == 0 the inner loop below never runs and
     * `loss` is read uninitialized at the attempt comparison — confirm
     * the caller guarantees epochs >= 1. */
    do {
        /* (Re)set learning rate. */
        tau = tau_init;

        /* Check if start value for `V` was supplied. */
        if (V_init == (void*)0) {
            /* Sample start value from the Stiefel manifold. */
            rStiefel(p, q, V, workMem, workLen);
        } else {
            /* (Re)set start value of `V` to `V_init`. */
            memcpy(V, V_init, p * q * sizeof(double));
        }

        /* Create projection matrix `Q <- I - V V^T` for initial `V`. */
        nullProj(V, p, q, Q);
        /* Compute distance vector. */
        matrixprod(X, n, p, Q, p, p, X_proj); // here X_proj is only `(n, p)`.
        rowDiffSquareSums(X_proj, n, p, vecD);
        /* Apply kernel to distances. */
        for (i = 0; i < nn; ++i) {
            vecK[i] = gaussKernel(vecD[i], gKscale);
        }
        /* Compute col(row) sums of kernel vector (sym. packed lower tri
         * matrix); because `K == K^T` the rowSums equal the colSums. */
        rowSumsSymVec(vecK, n, 1.0, colSums);
        /* Compute loss given the kernel vector and its column sums.
         * Additionally the first moment `y1` is computed and stored
         * (only an intermediate result, needed for `vecS`). */
        loss_last = cost(method, n, Y, vecK, colSums, y1, L);
        /* NOTE(review): assumes an absent logger is passed as C NULL
         * (not R_NilValue) — confirm against the .Call wrapper. */
        if (logger) {
            callLogger(logger, loggerEnv, attempt, 0, L, n, V, p, q, tau);
        }
        /* Calc the scaling vector used for final computation of grad. */
        scaling(method, n, Y, y1, L, vecD, vecK, colSums, vecS);
        /* Compute the Euclidean gradient `G`
         * (X_proj is reused as an (nn, p) scratch buffer here). */
        rowSweep(X_diff, nn, p, "*", vecS, X_proj);
        crossprod(X_diff, nn, p, X_proj, nn, p, workMem);
        matrixprod(workMem, p, p, V, p, q, G);
        if (method == CVE_METHOD_WEIGHTED) {
            /* Compute sum of all kernel-applied distances by summing the
             * colSums of the kernel matrix. */
            c = -(double)n; // to scale with sum(K) - n
            for (i = 0; i < n; ++i) {
                c += colSums[i];
            }
            // TODO: check for division by zero, but should not happen!!!
        } else {
            c = n; // TODO: move (init) up cause always the same ^^ ...
        }
        scale(agility / c, G, p * q); // in-place
        /* Compute skew-symmetric matrix `A` used in the Cayley transform.
         * `A <- tau * (G V^T - V G^T) + 0 * A` */
        skew(p, q, tau, G, V, 0.0, A);

        for (epoch = 0; epoch < epochs; ++epoch) {
            /* Move V along A. */
            cayleyTransform(p, q, A, V, V_tau, workMem);

            /* Create projection matrix for `V_tau`. */
            nullProj(V_tau, p, q, Q);
            /* Compute distance vector. */
            matrixprod(X, n, p, Q, p, p, X_proj); // here X_proj only `(n, p)`.
            rowDiffSquareSums(X_proj, n, p, vecD);
            /* Apply kernel to distances. */
            for (i = 0; i < nn; ++i) {
                vecK[i] = gaussKernel(vecD[i], gKscale);
            }
            /* Compute col(row) sums of kernel vector (sym. packed lower tri
             * matrix); because `K == K^T` the rowSums equal the colSums. */
            rowSumsSymVec(vecK, n, 1.0, colSums);
            /* Compute loss given the kernel vector and its column sums.
             * Additionally the first moment `y1` is computed and stored
             * (only an intermediate result, needed for `vecS`). */
            loss = cost(method, n, Y, vecK, colSums, y1, L);

            /* Check if step is appropriate; if not, reduce learning rate
             * and retry the step from the same V (A shrinks by gamma,
             * which is equivalent to shrinking tau). */
            if ((loss - loss_last) > loss_last * slack) {
                tau *= gamma;
                scale(gamma, A, p * p);
                continue;
            }

            // Compute error; use workMem (keep first `n`, they store `y1`).
            skew(p, q, 1.0, V, V_tau, 0.0, workMem);
            err = norm(workMem, p, p, "F");

            // Shift next step to current step and store loss to last.
            memcpy(V, V_tau, p * q * sizeof(double));
            loss_last = loss;

            if (logger) {
                callLogger(logger, loggerEnv,
                           attempt, epoch + 1, L, n, V, p, q, tau);
            }

            // Check break condition (converged or out of epochs).
            if (err < tol || epoch + 1 >= epochs) {
                break;
            }

            /* Continue computing the gradient. */
            /* Calc the scaling vector used for final computation of grad. */
            scaling(method, n, Y, y1, L, vecD, vecK, colSums, vecS);

            /* Compute the Euclidean gradient `G`. */
            rowSweep(X_diff, nn, p, "*", vecS, X_proj);
            crossprod(X_diff, nn, p, X_proj, nn, p, workMem);

            /* Momentum update: G <- momentum * G + agility * workMem V,
             * realized as a single dgemm with alpha = agility / c and
             * beta = momentum. */
            if (method == CVE_METHOD_WEIGHTED) {
                /* Compute sum of all kernel-applied distances by summing
                 * the colSums of the kernel matrix. */
                c = -(double)n; // to scale with sum(K) - n
                for (i = 0; i < n; ++i) {
                    c += colSums[i];
                }
                c = agility / c;
                // TODO: check for division by zero, but should not happen!!!
            } else {
                c = agility / n;
            }
            F77_NAME(dgemm)("N", "N", &p, &q, &p,
                            &c, workMem, &p, V, &p,
                            &momentum, G, &p);

            /* Compute skew-symmetric matrix `A` used in the Cayley
             * transform. `A <- tau * (G V^T - V G^T) + 0 * A` */
            skew(p, q, tau, G, V, 0.0, A);
        }

        /* Check if the current attempt improved on previous ones. */
        if (attempt == 0 || loss < loss_best) {
            loss_best = loss;
            memcpy(V_best, V, p * q * sizeof(double));
            memcpy(L_best, L, n * sizeof(double));
        }

    } while (++attempt < attempts);

    /* Report the overall best projection and its loss vector. */
    memcpy(V, V_best, p * q * sizeof(double));
    memcpy(L, L_best, n * sizeof(double));
}