wip: writing C gradient version

2019-09-12 18:42:28 +02:00 · 2019-09-12 18:42:28 +02:00 · 4d2651fe8a
commit 4d2651fe8a
parent 47917fe0bd
24 changed files with 1414 additions and 285 deletions
--- a/CVE/demo/runtime_test.R
+++ b/CVE/demo/runtime_test.R
@ -1,88 +1,71 @@
-# -----------------------------------------------------------------------------
-# Program: runtime_test.R
-# Author: Loki
-# Date: 2019.08.12
-#
-# Purpose:
-#      Comparing runtime of "MAVE" with "CVE".
-#
-# RevisionHistory:
-#      Loki -- 2019.08.12 initial creation
-# -----------------------------------------------------------------------------
+# Usage:
+# ~$ Rscript runtime_test.R

-# load CVE package
-library(CVE)
-# load MAVE package for comparison
-library(MAVE)
+library(CVE) # load CVE

-# set nr of simulations per dataset
-nr.sim <- 10
-
-# set names of datasets to run tests on
-dataset.names <- c("M1", "M2", "M3", "M4", "M5")
-
-#' Orthogonal projection to sub-space spanned by `B`
-#'
-#' @param B Matrix
-#' @return Orthogonal Projection Matrix
-proj <- function(B) {
-    B %*% solve(t(B) %*% B) %*% t(B)
+#' Writes progress to console.
+tell.user <- function(name, start.time, i, length) {
+    cat("\rRunning Test (", name, "):",
+        i, "/", length,
+        " - elapsed:", format(Sys.time() - start.time), "\033[K")
+}
+subspace.dist <- function(B1, B2){
+    P1 <- B1 %*% solve(t(B1) %*% B1) %*% t(B1)
+    P2 <- B2 %*% solve(t(B2) %*% B2) %*% t(B2)
+    return(norm(P1 - P2, type = 'F'))
 }

-#' Compute nObs given dataset dimension \code{n}.
-#'
-#' @param n Number of samples
-#' @return Numeric estimate of \code{nObs}
-nObs <- function (n) { n^0.5 }
+# Number of simulations
+SIM.NR <- 50
+# maximal number of iterations in curvilinear search algorithm
+MAXIT <- 50
+# number of arbitrary starting values for curvilinear optimization
+ATTEMPTS <- 10
+# set names of datasets
+dataset.names <- c("M1", "M2", "M3", "M4", "M5")
+# Set used CVE method
+methods <- c("simple") # c("legacy", "simple", "sgd", "linesearch")

-## prepare "logging"
-# result error, time, ... data.frame's
-error <- matrix(nrow = nr.sim, ncol = 2 * length(dataset.names))
-time <- matrix(nrow = nr.sim, ncol = 2 * length(dataset.names))
-# convert to data.frames
-error <- as.data.frame(error)
-time <- as.data.frame(time)
-# set names
-names(error) <- kronecker(c("CVE.", "MAVE."), dataset.names, paste0)
-names(time) <- kronecker(c("CVE.", "MAVE."), dataset.names, paste0)
+# Setup error and time tracking variables
+error <- matrix(NA, SIM.NR, length(methods) * length(dataset.names))
+time <- matrix(NA, SIM.NR, ncol(error))
+colnames(error) <- kronecker(paste0(dataset.names, '-'), methods, paste0)
+colnames(time) <- colnames(error)

-# get current time
+# only for telling user (to stdout)
+count <- 0
 start.time <- Sys.time()
-## main comparison loop (iterate `nr.sim` times for each dataset)
-for (i in seq_along(dataset.names)) {
-    for (j in 1:nr.sim) {
-        name <- dataset.names[i]
-        # reporting progress
-        cat("\rRunning Test (", name, j , "):",
-            (i - 1) * nr.sim + j, "/", length(dataset.names) * nr.sim,
-            " - Time since start:", format(Sys.time() - start.time), "\033[K")
-        # sample new dataset
+# Start simulation loop.
+for (sim in 1:SIM.NR) {
+    # Repeat for each dataset.
+    for (name in dataset.names) {
+        count <- count + 1
+        tell.user(name, start.time, count, SIM.NR * length(dataset.names))
+
+        # Create a new dataset
        ds <- dataset(name)
-        k <- ncol(ds$B) # real dim
-        data <- data.frame(X = ds$X, Y = ds$Y)
-        # call CVE
-        cve.time <- system.time(
-            cve.res <- cve(Y ~ .,
-                           data = data,
-                           method = "simple",
-                           k = k)
-        )
-        # call MAVE
-        mave.time <- system.time(
-            mave.res <- mave(Y ~ .,
-                             data = data,
-                             method = "meanMAVE")
-        )
-        # compute real and approximated sub-space projections
-        P <- proj(ds$B) # real
-        P.cve <- proj(cve.res$B)
-        P.mave <- proj(mave.res$dir[[k]])
-        # compute (and store) errors
-        error[j, paste0("CVE.", name)] <- norm(P - P.cve, 'F') / sqrt(2 * k)
-        error[j, paste0("MAVE.", name)] <- norm(P - P.mave, 'F') / sqrt(2 * k)
-        # store run-times
-        time[j, paste0("CVE.", name)] <- cve.time["elapsed"]
-        time[j, paste0("MAVE.", name)] <- mave.time["elapsed"]
+        # Prepare X, Y and combine to data matrix
+        Y <- ds$Y
+        X <- ds$X
+        data <- cbind(Y, X)
+        # get dimensions
+        dim <- ncol(X)
+        truedim <- ncol(ds$B)
+
+        for (method in methods) {
+            dr.time <- system.time(
+                dr <- cve.call(X, Y,
+                    method = method,
+                    k = truedim,
+                    attempts = ATTEMPTS
+                )
+            )
+            dr <- dr[[truedim]]
+
+            key <- paste0(name, '-', method)
+            error[sim, key] <- subspace.dist(dr$B, ds$B) / sqrt(2 * truedim)
+            time[sim, key] <- dr.time["elapsed"]
+        }
    }
 }

@ -91,16 +74,16 @@ print(colMeans(time))
 cat("\n## Error Means:\n")
 print(colMeans(error))

-len <- length(dataset.names)
-boxplot(as.matrix(error),
-    main = paste0("Error (nr.sim = ", nr.sim, ")"),
-    ylab = expression(error == group("||", P[B] - P[hat(B)], "||")[F] / sqrt(2*k)),
+at <- seq(ncol(error)) + rep(seq(ncol(error) / length(methods)) - 1, each = length(methods))
+boxplot(error,
+    main = paste0("Error (Nr of simulations ", SIM.NR, ")"),
+    ylab = "Error",
    las = 2,
-    at = c(1:len, 1:len + len + 1)
+    at = at
 )
-boxplot(as.matrix(time),
-    main = paste0("Time (nr.sim = ", nr.sim, ")"),
-    ylab = "time [sec]",
+boxplot(time,
+    main = paste0("Time (Nr of simulations ", SIM.NR, ")"),
+    ylab = "Time [sec]",
    las = 2,
-    at = c(1:len, 1:len + len + 1)
+    at = at
 )
--- a/CVE_R/DESCRIPTION
+++ b/CVE_R/DESCRIPTION
@ -1,4 +1,4 @@
-Package: CVE
+Package: CVEpureR
 Type: Package
 Title: Conditional Variance Estimator for Sufficient Dimension Reduction
 Version: 0.1
--- a/CVE_R/NAMESPACE
+++ b/CVE_R/NAMESPACE
@ -2,7 +2,6 @@

 S3method(plot,cve)
 S3method(summary,cve)
-export(col.pair.apply)
 export(cve)
 export(cve.call)
 export(cve.grid.search)
@ -15,7 +14,6 @@ export(estimate.bandwidth)
 export(grad)
 export(null)
 export(rStiefl)
-export(row.pair.apply)
 import(stats)
 importFrom(graphics,lines)
 importFrom(graphics,plot)
--- a/CVE_R/R/CVE.R
+++ b/CVE_R/R/CVE.R
@ -1,3 +1,17 @@
+#' Conditional Variance Estimator (CVE)
+#'
+#' Conditional Variance Estimator for Sufficient Dimension
+#'  Reduction
+#'
+#' TODO: And some details
+#'
+#'
+#' @references Fertl Likas, Bura Efstathia. Conditional Variance Estimation for Sufficient Dimension Reduction, 2019
+#'
+#' @docType package
+#' @author Loki
+"_PACKAGE"
+
 #' Implementation of the CVE method.
 #'
 #' Conditional Variance Estimator (CVE) is a novel sufficient dimension
@ -40,7 +54,7 @@
 #' @import stats
 #' @importFrom stats model.frame
 #' @export
-cve <- function(formula, data, method = "simple", ...) {
+cve <- function(formula, data, method = "simple", max.dim = 10, ...) {
    # check for type of `data` if supplied and set default
    if (missing(data)) {
        data <- environment(formula)
@ -69,12 +83,8 @@ cve <- function(formula, data, method = "simple", ...) {
 #' @param ... Method specific parameters.
 #' @rdname cve
 #' @export
-cve.call <- function(X, Y, method = "simple", nObs = nrow(X)^.5, k, ...) {
-
-    # TODO: replace default value of `k` by `max.dim` when fast enough
-    if (missing(k)) {
-        stop("TODO: parameter `k` (rank(B)) is required, replace by `max.dim`.")
-    }
+cve.call <- function(X, Y, method = "simple", nObs = nrow(X)^.5,
+                     min.dim = 1, max.dim = 10, k, ...) {

    # parameter checking
    if (!is.matrix(X)) {
@ -90,21 +100,42 @@ cve.call <- function(X, Y, method = "simple", nObs = nrow(X)^.5, k, ...) {
        stop('X is one dimensional, no need for dimension reduction.')
    }

+    if (!missing(k)) {
+        min.dim <- as.integer(k)
+        max.dim <- as.integer(k)
+    } else {
+        min.dim <- as.integer(min.dim)
+        max.dim <- as.integer(min(max.dim, ncol(X) - 1L))
+    }
+    if (min.dim > max.dim) {
+        stop('`min.dim` bigger `max.dim`.')
+    }
+    if (max.dim >= ncol(X)) {
+        stop('`max.dim` must be smaller than `ncol(X)`.')
+    }
+
    # Call specified method.
    method <- tolower(method)
-    if (method == 'simple') {
-        dr <- cve_simple(X, Y, k, nObs = nObs, ...)
-    } else if (method == 'linesearch') {
-        dr <- cve_linesearch(X, Y, k, nObs = nObs, ...)
-    } else if (method == 'sgd') {
-        dr <- cve_sgd(X, Y, k, nObs = nObs, ...)
-    } else {
-        stop('Got unknown method.')
+    call <- match.call()
+    dr <- list()
+    for (k in min.dim:max.dim) {
+        if (method == 'simple') {
+            dr.k <- cve_simple(X, Y, k, nObs = nObs, ...)
+        } else if (method == 'linesearch') {
+            dr.k <- cve_linesearch(X, Y, k, nObs = nObs, ...)
+        } else if (method == 'sgd') {
+            dr.k <- cve_sgd(X, Y, k, nObs = nObs, ...)
+        } else {
+            stop('Got unknown method.')
+        }
+        dr.k$k <- k
+        class(dr.k) <- "cve.k"
+        dr[[k]] <- dr.k
    }

    # augment result information
-    dr$method <- method
-    dr$call <- match.call()
+    dr.k$method <- method
+    dr.k$call <- call
    class(dr) <- "cve"
    return(dr)
 }
@ -134,41 +165,42 @@ cve.call <- function(X, Y, method = "simple", nObs = nrow(X)^.5, k, ...) {
 #' @export
 plot.cve <- function(x, ...) {

-    H <- x$history
-    H_1 <- H[!is.na(H[, 1]), 1]

-    defaults <- list(
-        main = "History",
-        xlab = "Iterations i",
-        ylab = expression(loss == L[n](V^{(i)})),
-        xlim = c(1, nrow(H)),
-        ylim = c(0, max(H)),
-        type = "l"
-    )
+    # H <- x$history
+    # H_1 <- H[!is.na(H[, 1]), 1]

-    call.plot <- match.call()
-    keys <- names(defaults)
-    keys <- keys[match(keys, names(call.plot)[-1], nomatch = 0) == 0]
+    # defaults <- list(
+    #     main = "History",
+    #     xlab = "Iterations i",
+    #     ylab = expression(loss == L[n](V^{(i)})),
+    #     xlim = c(1, nrow(H)),
+    #     ylim = c(0, max(H)),
+    #     type = "l"
+    # )

-    for (key in keys) {
-        call.plot[[key]] <- defaults[[key]]
-    }
+    # call.plot <- match.call()
+    # keys <- names(defaults)
+    # keys <- keys[match(keys, names(call.plot)[-1], nomatch = 0) == 0]

-    call.plot[[1L]] <- quote(plot)
-    call.plot$x <- quote(1:length(H_1))
-    call.plot$y <- quote(H_1)
+    # for (key in keys) {
+    #     call.plot[[key]] <- defaults[[key]]
+    # }

-    eval(call.plot)
+    # call.plot[[1L]] <- quote(plot)
+    # call.plot$x <- quote(1:length(H_1))
+    # call.plot$y <- quote(H_1)

-    if (ncol(H) > 1) {
-        for (i in 2:ncol(H)) {
-            H_i <- H[H[, i] > 0, i]
-            lines(1:length(H_i), H_i)
-        }
-    }
-    x.ends <- apply(H, 2, function(h) { length(h[!is.na(h)]) })
-    y.ends <- apply(H, 2, function(h) { tail(h[!is.na(h)], n=1) })
-    points(x.ends, y.ends)
+    # eval(call.plot)
+
+    # if (ncol(H) > 1) {
+    #     for (i in 2:ncol(H)) {
+    #         H_i <- H[H[, i] > 0, i]
+    #         lines(1:length(H_i), H_i)
+    #     }
+    # }
+    # x.ends <- apply(H, 2, function(h) { length(h[!is.na(h)]) })
+    # y.ends <- apply(H, 2, function(h) { tail(h[!is.na(h)], n=1) })
+    # points(x.ends, y.ends)
 }

 #' Prints a summary of a \code{cve} result.
--- a/CVE_R/R/cve_linesearch.R
+++ b/CVE_R/R/cve_linesearch.R
@ -13,18 +13,14 @@ cve_linesearch <- function(X, Y, k,
                           slack = 0,
                           epochs = 50L,
                           attempts = 10L,
-                           max.linesearch.iter = 10L
+                           max.linesearch.iter = 10L,
+                           logger = NULL
 ) {
    # Set `grad` functions environment to enable if to find this environments
    # local variabels, needed to enable the manipulation of this local variables
    # from within `grad`.
    environment(grad) <- environment()

-    # Setup histories.
-    loss.history <- matrix(NA, epochs, attempts)
-    error.history <- matrix(NA, epochs, attempts)
-    tau.history <- matrix(NA, epochs, attempts)
-
    # Get dimensions.
    n <- nrow(X)
    p <- ncol(X)
@ -44,8 +40,8 @@ cve_linesearch <- function(X, Y, k,
    # Compute lookup indexes for symmetrie, lower/upper
    # triangular parts and vectorization.
    pair.index <- elem.pairs(seq(n))
-    i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
-    j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
+    i <- pair.index[1, ] # `i` indices of `(i, j)` pairs
+    j <- pair.index[2, ] # `j` indices of `(i, j)` pairs
    # Matrix of vectorized indices. (vec(index) -> seq)
    index <- matrix(seq(n * n), n, n)
    lower <- index[lower.tri(index)]
@ -73,6 +69,13 @@ cve_linesearch <- function(X, Y, k,
        # Set last loss (aka, loss after applying the step).
        loss.last <- loss

+        # Call logger with initial values before starting optimization.
+        if (is.function(logger)) {
+            epoch <- 0 # Set epoch count to 0 (only relevant for logging).
+            error <- NA
+            logger(environment())
+        }
+
        ## Start optimization loop.
        for (epoch in 1:epochs) {

@ -124,16 +127,16 @@ cve_linesearch <- function(X, Y, k,
            # Compute error.
            error <- norm(V %*% t(V) - V.tau %*% t(V.tau), type = "F")

-            # Write history.
-            loss.history[epoch, attempt] <- loss
-            error.history[epoch, attempt] <- error
-            tau.history[epoch, attempt] <- tau
-
            # Check break condition (epoch check to skip ignored gradient calc).
            # Note: the devision by `sqrt(2 * k)` is included in `tol`.
            if (error < tol | epoch >= epochs) {
                # take last step and stop optimization.
                V <- V.tau
+                # Final call to the logger before stopping optimization
+                if (is.function(logger)) {
+                    G <- G.tau
+                    logger(environment())
+                }
                break()
            }

@ -141,6 +144,12 @@ cve_linesearch <- function(X, Y, k,
            V <- V.tau
            loss.last <- loss
            G <- G.tau
+
+            # Log after taking current step.
+            if (is.function(logger)) {
+                logger(environment())
+            }
+
        }

        # Check if current attempt improved previous ones
@ -152,9 +161,6 @@ cve_linesearch <- function(X, Y, k,
    }

    return(list(
-        loss.history = loss.history,
-        error.history = error.history,
-        tau.history = tau.history,
        loss = loss.best,
        V = V.best,
        B = null(V.best),
--- a/CVE_R/R/cve_sgd.R
+++ b/CVE_R/R/cve_sgd.R
@ -10,17 +10,14 @@ cve_sgd <- function(X, Y, k,
                    tol = 1e-3,
                    epochs = 50L,
                    batch.size = 16L,
-                    attempts = 10L
+                    attempts = 10L,
+                    logger = NULL
 ) {
    # Set `grad` functions environment to enable if to find this environments
    # local variabels, needed to enable the manipulation of this local variables
    # from within `grad`.
    environment(grad) <- environment()

-    # Setup histories.
-    loss.history <- matrix(NA, epochs, attempts)
-    error.history <- matrix(NA, epochs, attempts)
-
    # Get dimensions.
    n <- nrow(X) # Number of samples.
    p <- ncol(X) # Data dimensions
@ -32,7 +29,7 @@ cve_sgd <- function(X, Y, k,
    tol <- sqrt(2 * q) * tol

    # Estaimate bandwidth if not given.
-    if (missing(h) | !is.numeric(h)) {
+    if (missing(h) || !is.numeric(h)) {
        h <- estimate.bandwidth(X, k, nObs)
    }

@ -40,12 +37,11 @@ cve_sgd <- function(X, Y, k,
    # Compute lookup indexes for symmetrie, lower/upper
    # triangular parts and vectorization.
    pair.index <- elem.pairs(seq(n))
-    i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
-    j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
-    # Matrix of vectorized indices. (vec(index) -> seq)
-    index <- matrix(seq(n * n), n, n)
-    lower <- index[lower.tri(index)]
-    upper <- t(index)[lower]
+    i <- pair.index[1, ] # `i` indices of `(i, j)` pairs
+    j <- pair.index[2, ] # `j` indices of `(i, j)` pairs
+    # Index of vectorized matrix, for lower and upper triangular part.
+    lower <- ((i - 1) * n) + j
+    upper <- ((j - 1) * n) + i

    # Create all pairewise differences of rows of `X`.
    X_diff <- X[i, , drop = F] - X[j, , drop = F]
@ -69,17 +65,23 @@ cve_sgd <- function(X, Y, k,
        # Keep track of last `V` for computing error after an epoch.
        V.last <- V

+        if (is.function(logger)) {
+            loss <- grad(X, Y, V, h, loss.only = TRUE, persistent = TRUE)
+            error <- NA
+            epoch <- 0
+            logger(environment())
+        }
+
        # Repeat `epochs` times
        for (epoch in 1:epochs) {
            # Shuffle batches
            batch.shuffle <- sample(indices)

            # Make a step for each batch.
-            for (start in seq(1, n, batch.size)) {
+            for (batch.start in seq(1, n, batch.size)) {
                # Select batch data indices.
-                batch <- batch.shuffle[start:(start + batch.size - 1)]
-                # Remove `NA`'s (when `n` isn't a multiple of `batch.size`).
-                batch <- batch[!is.na(batch)]
+                batch.end <- min(batch.start + batch.size - 1, length(batch.shuffle))
+                batch <- batch.shuffle[batch.start:batch.end]

                # Compute batch gradient.
                loss <- NULL
@ -93,21 +95,24 @@ cve_sgd <- function(X, Y, k,
                # Parallet transport (on Stiefl manifold) into direction of `G`.
                V <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V)
            }
-            # Compute actuall loss after finishing optimization.
-            loss <- grad(X, Y, V, h, loss.only = TRUE, persistent = TRUE)
            # And the error for the history.
            error <- norm(V.last %*% t(V.last) - V %*% t(V), type = "F")
            V.last <- V

-            # Finaly write history.
-            loss.history[epoch, attempt] <- loss
-            error.history[epoch, attempt] <- error
+            if (is.function(logger)) {
+                # Compute loss at end of epoch for logging.
+                loss <- grad(X, Y, V, h, loss.only = TRUE, persistent = TRUE)
+                logger(environment())
+            }

            # Check break condition.
            if (error < tol) {
                break()
            }
        }
+        # Compute actual loss after finishing for comparing multiple attempts.
+        loss <- grad(X, Y, V, h, loss.only = TRUE, persistent = TRUE)
+
        # After each attempt, check if last attempt reached a better result.
        if (loss < loss.best) {
            loss.best <- loss
@ -116,8 +121,6 @@ cve_sgd <- function(X, Y, k,
    }

    return(list(
-        loss.history = loss.history,
-        error.history = error.history,
        loss = loss.best,
        V = V.best,
        B = null(V.best),
--- a/CVE_R/R/cve_simple.R
+++ b/CVE_R/R/cve_simple.R
@ -10,21 +10,18 @@ cve_simple <- function(X, Y, k,
                       tol = 1e-3,
                       slack = 0,
                       epochs = 50L,
-                       attempts = 10L
+                       attempts = 10L,
+                       logger = NULL
 ) {
    # Set `grad` functions environment to enable if to find this environments
    # local variabels, needed to enable the manipulation of this local variables
    # from within `grad`.
    environment(grad) <- environment()

-    # Setup histories.
-    loss.history <- matrix(NA, epochs, attempts)
-    error.history <- matrix(NA, epochs, attempts)
-
    # Get dimensions.
-    n <- nrow(X)
-    p <- ncol(X)
-    q <- p - k
+    n <- nrow(X) # Number of samples.
+    p <- ncol(X) # Data dimensions
+    q <- p - k   # Complement dimension of the SDR space.

    # Save initial learning rate `tau`.
    tau.init <- tau
@ -32,7 +29,7 @@ cve_simple <- function(X, Y, k,
    tol <- sqrt(2 * q) * tol

    # Estaimate bandwidth if not given.
-    if (missing(h) | !is.numeric(h)) {
+    if (missing(h) || !is.numeric(h)) {
        h <- estimate.bandwidth(X, k, nObs)
    }

@ -40,12 +37,11 @@ cve_simple <- function(X, Y, k,
    # Compute lookup indexes for symmetrie, lower/upper
    # triangular parts and vectorization.
    pair.index <- elem.pairs(seq(n))
-    i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
-    j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
-    # Matrix of vectorized indices. (vec(index) -> seq)
-    index <- matrix(seq(n * n), n, n)
-    lower <- index[lower.tri(index)]
-    upper <- t(index)[lower]
+    i <- pair.index[1, ] # `i` indices of `(i, j)` pairs
+    j <- pair.index[2, ] # `j` indices of `(i, j)` pairs
+    # Index of vectorized matrix, for lower and upper triangular part.
+    lower <- ((i - 1) * n) + j
+    upper <- ((j - 1) * n) + i

    # Create all pairewise differences of rows of `X`.
    X_diff <- X[i, , drop = F] - X[j, , drop = F]
@ -58,8 +54,7 @@ cve_simple <- function(X, Y, k,

    # Start loop for multiple attempts.
    for (attempt in 1:attempts) {
-
-        # reset step width `tau`
+        # Reset learning rate `tau`.
        tau <- tau.init

        # Sample a `(p, q)` dimensional matrix from the stiefel manifold as
@ -75,6 +70,13 @@ cve_simple <- function(X, Y, k,
        # Cayley transform matrix `A`
        A <- (G %*% t(V)) - (V %*% t(G))

+        # Call logger with initial values before starting optimization.
+        if (is.function(logger)) {
+            epoch <- 0 # Set epoch count to 0 (only relevant for logging).
+            error <- NA
+            logger(environment())
+        }
+
        ## Start optimization loop.
        for (epoch in 1:epochs) {
            # Apply learning rate `tau`.
@ -85,7 +87,7 @@ cve_simple <- function(X, Y, k,
            # Loss at position after a step.
            loss <- grad(X, Y, V.tau, h, loss.only = TRUE, persistent = TRUE)

-            # Check if step is appropriate
+            # Check if step is appropriate, iff not reduce learning rate.
            if ((loss - loss.last) > slack * loss.last) {
                tau <- tau / 2
                next() # Keep position and try with smaller `tau`.
@ -94,15 +96,15 @@ cve_simple <- function(X, Y, k,
            # Compute error.
            error <- norm(V %*% t(V) - V.tau %*% t(V.tau), type = "F")

-            # Write history.
-            loss.history[epoch, attempt] <- loss
-            error.history[epoch, attempt] <- error
-
            # Check break condition (epoch check to skip ignored gradient calc).
            # Note: the devision by `sqrt(2 * k)` is included in `tol`.
-            if (error < tol | epoch >= epochs) {
+            if (error < tol || epoch >= epochs) {
                # take last step and stop optimization.
                V <- V.tau
+                # Call logger last time befor stoping.
+                if (is.function(logger)) {
+                    logger(environment())
+                }
                break()
            }

@ -110,6 +112,11 @@ cve_simple <- function(X, Y, k,
            V <- V.tau
            loss.last <- loss

+            # Call logger after taking a step.
+            if (is.function(logger)) {
+                logger(environment())
+            }
+
            # Compute gradient at new position.
            G <- grad(X, Y, V, h, persistent = TRUE)

@ -126,8 +133,6 @@ cve_simple <- function(X, Y, k,
    }

    return(list(
-        loss.history = loss.history,
-        error.history = error.history,
        loss = loss.best,
        V = V.best,
        B = null(V.best),
--- a/CVE_R/R/gradient.R
+++ b/CVE_R/R/gradient.R
@ -24,12 +24,11 @@ grad <- function(X, Y, V, h,
        # Compute lookup indexes for symmetrie, lower/upper
        # triangular parts and vectorization.
        pair.index <- elem.pairs(seq(n))
-        i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
-        j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
-        # Matrix of vectorized indices. (vec(index) -> seq)
-        index <- matrix(seq(n * n), n, n)
-        lower <- index[lower.tri(index)]
-        upper <- t.default(index)[lower]
+        i <- pair.index[1, ] # `i` indices of `(i, j)` pairs
+        j <- pair.index[2, ] # `j` indices of `(i, j)` pairs
+        # Index of vectorized matrix, for lower and upper triangular part.
+        lower <- ((i - 1) * n) + j
+        upper <- ((j - 1) * n) + i

        # Create all pairewise differences of rows of `X`.
        X_diff <- X[i, , drop = F] - X[j, , drop = F]
@ -39,7 +38,7 @@ grad <- function(X, Y, V, h,
    Q <- diag(1, p) - tcrossprod(V, V)

    # Vectorized distance matrix `D`.
-    vecD <- rowSums((X_diff %*% Q)^2)
+    vecD <- colSums(tcrossprod(Q, X_diff)^2)

    # Weight matrix `W` (dnorm ... gaussean density function)
    W <- matrix(1, n, n) # `exp(0) == 1`
--- a/CVE_R/R/util.R
+++ b/CVE_R/R/util.R
@ -22,42 +22,19 @@ null <- function(V) {
    return(qr.Q(tmp, complete=TRUE)[, set, drop=FALSE])
 }

-#' Creates a (numeric) matrix where each row contains
+#' Creates a (numeric) matrix where each column contains
 #' an element to element matching.
 #' @param elements numeric vector of elements to match
-#' @return matrix of size `(n * (n - 1) / 2, 2)` for a argument of lenght `n`.
+#' @return matrix of size `(2, n * (n - 1) / 2)` for a argument of lenght `n`.
 #' @keywords internal
+#' @examples
+#'  elem.pairs(seq.int(2, 5))
 #' @export
 elem.pairs <- function(elements) {
    # Number of elements to match.
    n <- length(elements)
    # Create all combinations.
-    pairs <- cbind(rep(elements, each=n), rep(elements, n))
+    pairs <- rbind(rep(elements, each=n), rep(elements, n))
    # Select unique combinations without self interaction.
-    return(pairs[pairs[, 1] < pairs[, 2], ])
-}
-
-#' Apply function to pairs of matrix rows or columns.
-#'
-#' \code{row.pair.apply} applies \code{FUN} to each unique row pair without self
-#' interaction while \code{col.pair.apply} does the same for columns.
-#' @param X Matrix.
-#' @param FUN Function to apply to each pair.
-#' @examples
-#'  X <- matrix(seq(12), 4, 3)
-#' # matrix containing all row to row differences.
-#'  row.pair.apply(X, `-`)
-#' @aliases row.pair.apply, col.pair.apply
-#' @keywords internal
-#' @export
-row.pair.apply <- function(X, FUN) {
-    pairs <- elem.pairs(seq(nrow(X)))
-    return(FUN(X[pairs[, 1], ], X[pairs[, 2], ]))
-}
-#' @rdname row.pair.apply
-#' @keywords internal
-#' @export
-col.pair.apply <- function(X, FUN) {
-    pairs <- elem.pairs(seq(ncol(X)))
-    return(FUN(X[, pairs[, 1]], X[, pairs[, 2]]))
+    return(pairs[, pairs[1, ] < pairs[2, ]])
 }
--- a/CVE_R/demo/00Index
+++ b/CVE_R/demo/00Index
@ -0,0 +1,2 @@
+runtime_test    Runtime comparison of CVE against MAVE for M1 - M5 datasets.
+logging         Example of a logger function for cve algorithm analysis.
--- a/CVE_R/demo/logging.R
+++ b/CVE_R/demo/logging.R
@ -0,0 +1,43 @@
+library(CVEpureR)
+
+# Setup histories.
+(epochs <- 50)
+(attempts <- 10)
+loss.history       <- matrix(NA, epochs + 1, attempts)
+error.history      <- matrix(NA, epochs + 1, attempts)
+tau.history        <- matrix(NA, epochs + 1, attempts)
+true.error.history <- matrix(NA, epochs + 1, attempts)
+
+# Create a dataset
+ds <- dataset("M1")
+X <- ds$X
+Y <- ds$Y
+B <- ds$B # the true `B`
+(k <- ncol(ds$B))
+
+# True projection matrix.
+P <- B %*% solve(t(B) %*% B) %*% t(B)
+# Define the logger for the `cve()` method.
+logger <- function(env) {
+    # Note the `<<-` assignement!
+    loss.history[env$epoch + 1, env$attempt] <<- env$loss
+    error.history[env$epoch + 1, env$attempt] <<- env$error
+    tau.history[env$epoch + 1, env$attempt] <<- env$tau
+    # Compute true error by comparing to the true `B`
+    B.est <- null(env$V) # Function provided by CVE
+    P.est <- B.est %*% solve(t(B.est) %*% B.est) %*% t(B.est)
+    true.error <- norm(P - P.est, 'F') / sqrt(2 * k)
+    true.error.history[env$epoch + 1, env$attempt] <<- true.error
+}
+# Performe SDR for ONE `k`.
+dr <- cve(Y ~ X, k = k, logger = logger, epochs = epochs, attempts = attempts)
+# Plot history's
+par(mfrow = c(2, 2))
+matplot(loss.history,       type = 'l', log = 'y', xlab = 'iter',
+        main = 'loss', ylab = expression(L(V[iter])))
+matplot(error.history,      type = 'l', log = 'y', xlab = 'iter',
+        main = 'error', ylab = 'error')
+matplot(tau.history,        type = 'l', log = 'y', xlab = 'iter',
+        main = 'tau', ylab = 'tau')
+matplot(true.error.history, type = 'l', log = 'y', xlab = 'iter',
+        main = 'true error', ylab = 'true error')
--- a/CVE_R/demo/runtime_test.R
+++ b/CVE_R/demo/runtime_test.R
@ -0,0 +1,89 @@
+# Usage:
+# ~$ Rscript runtime_test.R
+
+library(CVEpureR) # load CVE
+
+#' Writes progress to console.
+tell.user <- function(name, start.time, i, length) {
+    cat("\rRunning Test (", name, "):",
+        i, "/", length,
+        " - elapsed:", format(Sys.time() - start.time), "\033[K")
+}
+subspace.dist <- function(B1, B2){
+    P1 <- B1 %*% solve(t(B1) %*% B1) %*% t(B1)
+    P2 <- B2 %*% solve(t(B2) %*% B2) %*% t(B2)
+    return(norm(P1 - P2, type = 'F'))
+}
+
+# Number of simulations
+SIM.NR <- 50
+# maximal number of iterations in curvilinear search algorithm
+MAXIT <- 50
+# number of arbitrary starting values for curvilinear optimization
+ATTEMPTS <- 10
+# set names of datasets
+dataset.names <- c("M1", "M2", "M3", "M4", "M5")
+# Set used CVE method
+methods <- c("simple") # c("legacy", "simple", "sgd", "linesearch")
+
+# Setup error and time tracking variables
+error <- matrix(NA, SIM.NR, length(methods) * length(dataset.names))
+time <- matrix(NA, SIM.NR, ncol(error))
+colnames(error) <- kronecker(paste0(dataset.names, '-'), methods, paste0)
+colnames(time) <- colnames(error)
+
+# only for telling user (to stdout)
+count <- 0
+start.time <- Sys.time()
+# Start simulation loop.
+for (sim in 1:SIM.NR) {
+    # Repeat for each dataset.
+    for (name in dataset.names) {
+        count <- count + 1
+        tell.user(name, start.time, count, SIM.NR * length(dataset.names))
+
+        # Create a new dataset
+        ds <- dataset(name)
+        # Prepare X, Y and combine to data matrix
+        Y <- ds$Y
+        X <- ds$X
+        data <- cbind(Y, X)
+        # get dimensions
+        dim <- ncol(X)
+        truedim <- ncol(ds$B)
+
+        for (method in methods) {
+            dr.time <- system.time(
+                dr <- cve.call(X, Y,
+                    method = method,
+                    k = truedim,
+                    attempts = ATTEMPTS
+                )
+            )
+            dr <- dr[[truedim]]
+
+            key <- paste0(name, '-', method)
+            error[sim, key] <- subspace.dist(dr$B, ds$B) / sqrt(2 * truedim)
+            time[sim, key] <- dr.time["elapsed"]
+        }
+    }
+}
+
+cat("\n\n## Time [sec] Means:\n")
+print(colMeans(time))
+cat("\n## Error Means:\n")
+print(colMeans(error))
+
+at <- seq(ncol(error)) + rep(seq(ncol(error) / length(methods)) - 1, each = length(methods))
+boxplot(error,
+    main = paste0("Error (Nr of simulations ", SIM.NR, ")"),
+    ylab = "Error",
+    las = 2,
+    at = at
+)
+boxplot(time,
+    main = paste0("Time (Nr of simulations ", SIM.NR, ")"),
+    ylab = "Time [sec]",
+    las = 2,
+    at = at
+)
--- a/CVE_R/man/CVEpureR-package.Rd
+++ b/CVE_R/man/CVEpureR-package.Rd
@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/CVE.R
+\docType{package}
+\name{CVEpureR-package}
+\alias{CVEpureR}
+\alias{CVEpureR-package}
+\title{Conditional Variance Estimator (CVE)}
+\description{
+Conditional Variance Estimator for Sufficient Dimension
+ Reduction
+}
+\details{
+TODO: And some details
+}
+\references{
+Fertl Likas, Bura Efstathia. Conditional Variance Estimation for Sufficient Dimension Reduction, 2019
+}
+\author{
+Loki
+}
--- a/CVE_R/man/cve.Rd
+++ b/CVE_R/man/cve.Rd
@ -5,9 +5,10 @@
 \alias{cve.call}
 \title{Implementation of the CVE method.}
 \usage{
-cve(formula, data, method = "simple", ...)
+cve(formula, data, method = "simple", max.dim = 10, ...)

-cve.call(X, Y, method = "simple", nObs = nrow(X)^0.5, k, ...)
+cve.call(X, Y, method = "simple", nObs = nrow(X)^0.5, min.dim = 1,
+  max.dim = 10, k, ...)
 }
 \arguments{
 \item{formula}{Formel for the regression model defining `X`, `Y`.
--- a/CVE_R/man/cve_linesearch.Rd
+++ b/CVE_R/man/cve_linesearch.Rd
@ -7,7 +7,7 @@ conditions.}
 \usage{
 cve_linesearch(X, Y, k, nObs = sqrt(nrow(X)), h = NULL, tau = 1,
  tol = 0.001, rho1 = 0.1, rho2 = 0.9, slack = 0, epochs = 50L,
-  attempts = 10L, max.linesearch.iter = 10L)
+  attempts = 10L, max.linesearch.iter = 10L, logger = NULL)
 }
 \description{
 Implementation of the CVE method using curvilinear linesearch with Armijo-Wolfe
--- a/CVE_R/man/cve_sgd.Rd
+++ b/CVE_R/man/cve_sgd.Rd
@ -6,7 +6,8 @@
 a classic GD method unsing no further tricks.}
 \usage{
 cve_sgd(X, Y, k, nObs = sqrt(nrow(X)), h = NULL, tau = 0.01,
-  tol = 0.001, epochs = 50L, batch.size = 16L, attempts = 10L)
+  tol = 0.001, epochs = 50L, batch.size = 16L, attempts = 10L,
+  logger = NULL)
 }
 \description{
 Simple implementation of the CVE method. 'Simple' means that this method is
--- a/CVE_R/man/cve_simple.Rd
+++ b/CVE_R/man/cve_simple.Rd
@ -6,7 +6,8 @@
 a classic GD method unsing no further tricks.}
 \usage{
 cve_simple(X, Y, k, nObs = sqrt(nrow(X)), h = NULL, tau = 1,
-  tol = 0.001, slack = 0, epochs = 50L, attempts = 10L)
+  tol = 0.001, slack = 0, epochs = 50L, attempts = 10L,
+  logger = NULL)
 }
 \description{
 Simple implementation of the CVE method. 'Simple' means that this method is
--- a/CVE_R/man/elem.pairs.Rd
+++ b/CVE_R/man/elem.pairs.Rd
@ -2,7 +2,7 @@
 % Please edit documentation in R/util.R
 \name{elem.pairs}
 \alias{elem.pairs}
-\title{Creates a (numeric) matrix where each row contains
+\title{Creates a (numeric) matrix where each column contains
 an element to element matching.}
 \usage{
 elem.pairs(elements)
@ -11,10 +11,13 @@ elem.pairs(elements)
 \item{elements}{numeric vector of elements to match}
 }
 \value{
-matrix of size `(n * (n - 1) / 2, 2)` for a argument of lenght `n`.
+matrix of size `(2, n * (n - 1) / 2)` for a argument of lenght `n`.
 }
 \description{
-Creates a (numeric) matrix where each row contains
+Creates a (numeric) matrix where each column contains
 an element to element matching.
 }
+\examples{
+ elem.pairs(seq.int(2, 5))
+}
 \keyword{internal}
--- a/CVE_R/man/row.pair.apply.Rd
+++ b/CVE_R/man/row.pair.apply.Rd
@ -1,27 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/util.R
-\name{row.pair.apply}
-\alias{row.pair.apply}
-\alias{row.pair.apply,}
-\alias{col.pair.apply}
-\title{Apply function to pairs of matrix rows or columns.}
-\usage{
-row.pair.apply(X, FUN)
-
-col.pair.apply(X, FUN)
-}
-\arguments{
-\item{X}{Matrix.}
-
-\item{FUN}{Function to apply to each pair.}
-}
-\description{
-\code{row.pair.apply} applies \code{FUN} to each unique row pair without self
-interaction while \code{col.pair.apply} does the same for columns.
-}
-\examples{
- X <- matrix(seq(12), 4, 3)
-# matrix containing all row to row differences.
- row.pair.apply(X, `-`)
-}
-\keyword{internal}
--- a/notes.md
+++ b/notes.md
@ -1,3 +1,12 @@
+# General Notes for Souce Code analysis
+## Search in multiple files.
+Using the Linux `grep` program with the parameters `-rnw` and specifying a include files filter like the following example.
+```bash
+grep --include=*\.{c,h,R} -rnw '.' -e "sweep"
+```
+searches in all `C` source and header fils as well as `R` source files for the term _sweep_.
+
+# Development
 ## Build and install.
 To build the package the `devtools` package is used. This also provides `roxygen2` which is used for documentation and authomatic creaton of the `NAMESPACE` file.
 ```R
@ -29,6 +38,55 @@ install.packages(path, repos = NULL, type = "source")
 ```
 **Note: I only recommend this approach during development.**

+# Analysing
+## Logging (a `cve` run).
+To log `loss`, `error` (estimated) the true error (error of current estimated `B` against the true `B`) or even the stepsize one can use the `logger` parameter. A `logger` is a function that gets the current `environment` of the CVE optimization methods (__do not alter this environment, only read from it__). This can be used to create logs like in the following example.
+```R
+library(CVE)
+
+# Setup histories.
+(epochs <- 50)
+(attempts <- 10)
+loss.history       <- matrix(NA, epochs + 1, attempts)
+error.history      <- matrix(NA, epochs + 1, attempts)
+tau.history        <- matrix(NA, epochs + 1, attempts)
+true.error.history <- matrix(NA, epochs + 1, attempts)
+
+# Create a dataset
+ds <- dataset("M1")
+X <- ds$X
+Y <- ds$Y
+B <- ds$B # the true `B`
+(k <- ncol(ds$B))
+
+# True projection matrix.
+P <- B %*% solve(t(B) %*% B) %*% t(B)
+# Define the logger for the `cve()` method.
+logger <- function(env) {
+    # Note the `<<-` assignement!
+    loss.history[env$epoch + 1, env$attempt] <<- env$loss
+    error.history[env$epoch + 1, env$attempt] <<- env$error
+    tau.history[env$epoch + 1, env$attempt] <<- env$tau
+    # Compute true error by comparing to the true `B`
+    B.est <- null(env$V) # Function provided by CVE
+    P.est <- B.est %*% solve(t(B.est) %*% B.est) %*% t(B.est)
+    true.error <- norm(P - P.est, 'F') / sqrt(2 * k)
+    true.error.history[env$epoch + 1, env$attempt] <<- true.error
+}
+# Performa SDR
+dr <- cve(Y ~ X, k = k, logger = logger, epochs = epochs, attempts = attempts)
+# Plot history's
+par(mfrow = c(2, 2))
+matplot(loss.history,       type = 'l', log = 'y', xlab = 'iter',
+        main = 'loss', ylab = expression(L(V[iter])))
+matplot(error.history,      type = 'l', log = 'y', xlab = 'iter',
+        main = 'error', ylab = 'error')
+matplot(tau.history,        type = 'l', log = 'y', xlab = 'iter',
+        main = 'tau', ylab = 'tau')
+matplot(true.error.history, type = 'l', log = 'y', xlab = 'iter',
+        main = 'true error', ylab = 'true error')
+```
+
 ## Reading log files.
 The runtime tests (upcomming further tests) are creating log files saved in `tmp/`. These log files are `CSV` files (actualy `TSV`) with a header storing the test results. Depending on the test the files may contain differnt data. As an example we use the runtime test logs which store in each line the `dataset`, the used `method` as well as the `error` (actual error of estimated `B` against real `B`) and the `time`. For reading and analysing the data see the following example.
 ```R
@ -66,7 +124,9 @@ jedi.seeks <- function() {
 }

 trooper.seeks()
+# [1] "These aren't the droids you're looking for."
 jedi.seeks()
+# [1] "R2-D2", "C-3PO"
 ```

 The next example ilustrates how to write (without local copies) to variables outside the functions local environment.
@ -127,27 +187,25 @@ library(microbenchmark)

 A <- matrix(runif(120), 12, 10)

-# Matrix trace.
-tr <- function(M) sum(diag(M))
-
 # Check correctnes and benckmark performance.
 stopifnot(
    all.equal(
-        tr(t(A) %*% A),
-        sum(diag(t(A) %*% A)),
-        sum(A * A)
+        sum(diag(t(A) %*% A)), sum(diag(crossprod(A, A)))
+    ),
+    all.equal(
+        sum(diag(t(A) %*% A)), sum(A * A)
    )
 )
 microbenchmark(
-    tr(t(A) %*% A),
-    sum(diag(t(A) %*% A)),
-    sum(A * A)
+    MM = sum(diag(t(A) %*% A)),
+    cross = sum(diag(crossprod(A, A))),
+    elem = sum(A * A)
 )
 # Unit: nanoseconds
-#                   expr  min   lq    mean median     uq   max neval
-#         tr(t(A) %*% A) 4335 4713 5076.36 4949.5 5402.5  7928   100
-#  sum(diag(t(A) %*% A)) 4106 4429 5233.89 4733.5 5057.5 49308   100
-#             sum(A * A)  540  681  777.07  740.0  818.5  3572   100
+#   expr  min     lq    mean median     uq   max neval
+#     MM 4232 4570.0 5138.81   4737 4956.0 40308   100
+#  cross 2523 2774.5 2974.93   2946 3114.5  5078   100
+#   elem  582  762.5  973.02    834  964.0 12945   100
 ```

 ```R
@ -243,6 +301,31 @@ microbenchmark(
 #  outer 1141.479 1216.929 1404.702 1317.7315 1582.800  2531.766   100
 ```

+### Fast dist matrix computation (aka. row sum of squares).
+```R
+library(microbenchmark)
+library(CVE)
+
+(n <- 200)
+(N <- n * (n - 1) / 2)
+(p <- 12)
+M <- matrix(runif(N * p), N, p)
+
+stopifnot(
+    all.equal(rowSums(M^2), rowSums.c(M^2)),
+    all.equal(rowSums(M^2), rowSquareSums.c(M))
+)
+microbenchmark(
+    sums = rowSums(M^2),
+    sums.c = rowSums.c(M^2),
+    sqSums.c = rowSquareSums.c(M)
+)
+# Unit: microseconds
+#      expr     min       lq      mean    median       uq      max neval
+#      sums 666.311 1051.036 1612.3100 1139.0065 1547.657 13940.97   100
+#    sums.c 342.647  672.453 1009.9109  740.6255 1224.715 13765.90   100
+#  sqSums.c 115.325  142.128  175.6242  153.4645  169.678   759.87   100
+```

 ## Using `Rprof()` for performance.
 The standart method for profiling where an algorithm is spending its time is with `Rprof()`.
--- a/runtime_test.R
+++ b/runtime_test.R
@ -14,7 +14,7 @@ subspace.dist <- function(B1, B2){
 }

 # Number of simulations
-SIM.NR <- 50
+SIM.NR <- 20
 # maximal number of iterations in curvilinear search algorithm
 MAXIT <- 50
 # number of arbitrary starting values for curvilinear optimization
@ -22,7 +22,7 @@ ATTEMPTS <- 10
 # set names of datasets
 dataset.names <- c("M1", "M2", "M3", "M4", "M5")
 # Set used CVE method
-methods <- c("simple") #, "sgd") # "legacy"
+methods <- c("simple") # c("legacy", "simple", "sgd", "linesearch")

 library(CVE) # load CVE
 if ("legacy" %in% methods) {
@ -83,6 +83,7 @@ for (sim in 1:SIM.NR) {
                        attempts = ATTEMPTS
                    )
                )
+                dr <- dr[[truedim]]
            }

            key <- paste0(name, '-', method)
--- a/wip.R
+++ b/wip.R
@ -0,0 +1,329 @@
+library(microbenchmark)
+
+dyn.load("wip.so")
+
+
+## rowSum* .call --------------------------------------------------------------
+rowSums.c <- function(M) {
+    stopifnot(
+        is.matrix(M),
+        is.numeric(M)
+    )
+    if (!is.double(M)) {
+        M <- matrix(as.double(M), nrow = nrow(M))
+    }
+
+    .Call('R_rowSums', PACKAGE = 'wip', M)
+}
+colSums.c <- function(M) {
+    stopifnot(
+        is.matrix(M),
+        is.numeric(M)
+    )
+    if (!is.double(M)) {
+        M <- matrix(as.double(M), nrow = nrow(M))
+    }
+
+    .Call('R_colSums', PACKAGE = 'wip', M)
+}
+rowSquareSums.c <- function(M) {
+    stopifnot(
+        is.matrix(M),
+        is.numeric(M)
+    )
+    if (!is.double(M)) {
+        M <- matrix(as.double(M), nrow = nrow(M))
+    }
+
+    .Call('R_rowSquareSums', PACKAGE = 'wip', M)
+}
+rowSumsSymVec.c <- function(vecA, nrow, diag = 0.0) {
+    stopifnot(
+        is.vector(vecA),
+        is.numeric(vecA),
+        is.numeric(diag),
+        nrow * (nrow - 1) == length(vecA) * 2
+    )
+    if (!is.double(vecA)) {
+        vecA <- as.double(vecA)
+    }
+    .Call('R_rowSumsSymVec', PACKAGE = 'wip',
+          vecA, as.integer(nrow), as.double(diag))
+}
+rowSweep.c <- function(A, v, op = '-') {
+    stopifnot(
+        is.matrix(A),
+        is.numeric(v)
+    )
+    if (!is.double(A)) {
+        A <- matrix(as.double(A), nrow = nrow(A))
+    }
+    if (!is.vector(v) || !is.double(v)) {
+        v <- as.double(v)
+    }
+    stopifnot(
+        nrow(A) == length(v),
+        op %in% c('+', '-', '*', '/')
+    )
+
+    .Call('R_rowSweep', PACKAGE = 'wip', A, v, op)
+}
+
+## row*, col* tests ------------------------------------------------------------
+n <- 3000
+M <- matrix(runif(n * 12), n, 12)
+stopifnot(
+    all.equal(rowSums(M^2), rowSums.c(M^2)),
+    all.equal(colSums(M), colSums.c(M))
+)
+microbenchmark(
+    rowSums     = rowSums(M^2),
+    rowSums.c   = rowSums.c(M^2),
+    rowSqSums.c = rowSquareSums.c(M)
+)
+microbenchmark(
+    colSums   = colSums(M),
+    colSums.c = colSums.c(M)
+)
+
+sum = rowSums(M)
+stopifnot(all.equal(
+    sweep(M, 1, sum, FUN = `/`),
+    rowSweep.c(M, sum, '/') # Col-Normalize)
+), all.equal(
+    sweep(M, 1, sum, FUN = `/`),
+    M / sum
+))
+microbenchmark(
+    sweep = sweep(M, 1, sum, FUN = `/`),
+    M / sum,
+    rowSweep.c = rowSweep.c(M, sum, '/') # Col-Normalize)
+)
+
+# Create symmetric matrix with constant diagonal entries.
+nrow <- 200
+diag <- 1.0
+Sym <- tcrossprod(runif(nrow))
+diag(Sym) <- diag
+# Get vectorized lower triangular part of `Sym` matrix.
+SymVec <- Sym[lower.tri(Sym)]
+stopifnot(all.equal(
+    rowSums(Sym),
+    rowSumsSymVec.c(SymVec, nrow, diag)
+))
+microbenchmark(
+    rowSums = rowSums(Sym),
+    rowSums.c = rowSums.c(Sym),
+    rowSumsSymVec.c = rowSumsSymVec.c(SymVec, nrow, diag)
+)
+
+## Matrix-Matrix opperation .call ---------------------------------------------
+matrixprod.c <- function(A, B) {
+    stopifnot(
+        is.matrix(A), is.numeric(A),
+        is.matrix(B), is.numeric(B),
+        ncol(A) == nrow(B)
+    )
+    if (!is.double(A)) {
+        A <- matrix(as.double(A), nrow = nrow(A))
+    }
+    if (!is.double(B)) {
+        B <- matrix(as.double(B), nrow = nrow(B))
+    }
+
+    .Call('R_matrixprod', PACKAGE = 'wip', A, B)
+}
+crossprod.c <- function(A, B) {
+    stopifnot(
+        is.matrix(A), is.numeric(A),
+        is.matrix(B), is.numeric(B),
+        nrow(A) == nrow(B)
+    )
+    if (!is.double(A)) {
+        A <- matrix(as.double(A), nrow = nrow(A))
+    }
+    if (!is.double(B)) {
+        B <- matrix(as.double(B), nrow = nrow(B))
+    }
+
+    .Call('R_crossprod', PACKAGE = 'wip', A, B)
+}
+skewSymRank2k.c <- function(A, B, alpha = 1, beta = 0) {
+    stopifnot(
+        is.matrix(A), is.numeric(A),
+        is.matrix(B), is.numeric(B),
+        all(dim(A) == dim(B)),
+        is.numeric(alpha), length(alpha) == 1L,
+        is.numeric(beta), length(beta) == 1L
+    )
+    if (!is.double(A)) {
+        A <- matrix(as.double(A), nrow = nrow(A))
+    }
+    if (!is.double(B)) {
+        B <- matrix(as.double(B), nrow = nrow(B))
+    }
+
+    .Call('R_skewSymRank2k', PACKAGE = 'wip', A, B,
+          as.double(alpha), as.double(beta))
+}
+
+## Matrix-Matrix opperation tests ---------------------------------------------
+n <- 200
+k <- 100
+m <- 300
+
+A <- matrix(runif(n * k), n, k)
+B <- matrix(runif(k * m), k, m)
+stopifnot(
+    all.equal(A %*% B, matrixprod.c(A, B))
+)
+microbenchmark(
+    "%*%" = A %*% B,
+    matrixprod.c = matrixprod.c(A, B)
+)
+
+A <- matrix(runif(k * n), k, n)
+B <- matrix(runif(k * m), k, m)
+stopifnot(
+    all.equal(crossprod(A, B), crossprod.c(A, B))
+)
+microbenchmark(
+    crossprod = crossprod(A, B),
+    crossprod.c = crossprod.c(A, B)
+)
+
+n <- 12
+k <- 11
+A <- matrix(runif(n * k), n, k)
+B <- matrix(runif(n * k), n, k)
+stopifnot(all.equal(
+    A %*% t(B) - B %*% t(A), skewSymRank2k.c(A, B)
+))
+microbenchmark(
+    A %*% t(B) - B %*% t(A),
+    skewSymRank2k.c(A, B)
+)
+
+## Orthogonal projection onto null space .Call --------------------------------
+nullProj.c <- function(B) {
+    stopifnot(
+        is.matrix(B), is.numeric(B)
+    )
+    if (!is.double(B)) {
+        B <- matrix(as.double(B), nrow = nrow(B))
+    }
+
+    .Call('R_nullProj', PACKAGE = 'wip', B)
+}
+## Orthogonal projection onto null space tests --------------------------------
+p <- 12
+q <- 10
+V <- qr.Q(qr(matrix(rnorm(p * q, 0, 1), p, q)))
+
+# Projection matrix onto `span(V)`
+Q <- diag(1, p) - tcrossprod(V, V)
+stopifnot(
+    all.equal(Q, nullProj.c(V))
+)
+microbenchmark(
+    nullProj = diag(1, p) - tcrossprod(V, V),
+    nullProj.c = nullProj.c(V)
+)
+
+
+# ## WIP for gradient. ----------------------------------------------------------
+
+gradient.c <- function(X, X_diff, Y, V, h) {
+    stopifnot(
+        is.matrix(X), is.double(X),
+        is.matrix(X_diff), is.double(X_diff),
+            ncol(X_diff) == ncol(X), nrow(X_diff) == nrow(X) * (nrow(X) - 1) / 2,
+        is.vector(Y) || (is.matrix(Y) && pmin(dim(Y)) == 1L), is.double(Y),
+            length(Y) == nrow(X),
+        is.matrix(V), is.double(V),
+            nrow(V) == ncol(X),
+        is.vector(h), is.numeric(h), length(h) == 1
+    )
+
+    .Call('R_gradient', PACKAGE = 'wip',
+          X, X_diff, as.double(Y), V, as.double(h));
+}
+
+elem.pairs <- function(elements) {
+    # Number of elements to match.
+    n <- length(elements)
+    # Create all combinations.
+    pairs <- rbind(rep(elements, each=n), rep(elements, n))
+    # Select unique combinations without self interaction.
+    return(pairs[, pairs[1, ] < pairs[2, ]])
+}
+grad <- function(X, Y, V, h, persistent = TRUE) {
+    n <- nrow(X)
+    p <- ncol(X)
+
+    if (!persistent) {
+        pair.index <- elem.pairs(seq(n))
+        i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
+        j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
+        lower <- ((i - 1) * n) + j
+        upper <- ((j - 1) * n) + i
+        X_diff <- X[i, , drop = F] - X[j, , drop = F]
+    }
+
+    # Projection matrix onto `span(V)`
+    Q <- diag(1, p) - tcrossprod(V, V)
+    # Vectorized distance matrix `D`.
+    vecD <- rowSums((X_diff %*% Q)^2)
+
+
+    # Weight matrix `W` (dnorm ... gaussean density function)
+    W <- matrix(1, n, n) # `exp(0) == 1`
+    W[lower] <- exp((-0.5 / h) * vecD^2) # Set lower tri. part
+    W[upper] <- t.default(W)[upper] # Mirror lower tri. to upper
+    W <- sweep(W, 2, colSums(W), FUN = `/`) # Col-Normalize
+
+    # Weighted `Y` momentums
+    y1 <- Y %*% W # Result is 1D -> transposition irrelevant
+    y2 <- Y^2 %*% W
+
+    # Per example loss `L(V, X_i)`
+    L <- y2 - y1^2
+
+    # Vectorized Weights with forced symmetry
+    vecS <- (L[i] - (Y[j] - y1[i])^2) * W[lower]
+    vecS <- vecS + ((L[j] - (Y[i] - y1[j])^2) * W[upper])
+    # Compute scaling of `X` row differences
+    vecS <- vecS * vecD
+
+    G <- crossprod(X_diff, X_diff * vecS) %*% V
+    G <- (-2 / (n * h^2)) * G
+    return(G)
+}
+rStiefl <- function(p, q) {
+    return(qr.Q(qr(matrix(rnorm(p * q, 0, 1), p, q))))
+}
+
+n <- 200
+p <- 12
+q <- 10
+
+X <- matrix(runif(n * p), n, p)
+Y <- runif(n)
+V <- rStiefl(p, q)
+h <- 0.1
+
+pair.index <- elem.pairs(seq(n))
+i <- pair.index[1, ] # `i` indices of `(i, j)` pairs
+j <- pair.index[2, ] # `j` indices of `(i, j)` pairs
+lower <- ((i - 1) * n) + j
+upper <- ((j - 1) * n) + i
+X_diff <- X[i, , drop = F] - X[j, , drop = F]
+
+stopifnot(all.equal(
+    grad(X, Y, V, h),
+    gradient.c(X, X_diff, Y, V, h)
+))
+microbenchmark(
+    grad = grad(X, Y, V, h),
+    gradient.c = gradient.c(X, X_diff, Y, V, h)
+)
--- a/wip.c
+++ b/wip.c
@ -0,0 +1,421 @@
+#include <stdlib.h>
+
+#include <R_ext/BLAS.h>
+#include <R_ext/Lapack.h>
+#include <R_ext/Error.h>
+// #include <Rmath.h>
+
+#include "wip.h"
+
+static inline void rowSums(const double *A,
+                           const int nrow, const int ncol,
+                           double *sum) {
+    int i, j, block_size, block_size_i;
+    const double *A_block = A;
+    const double *A_end = A + nrow * ncol;
+
+    if (nrow > CVE_MEM_CHUNK_SIZE) {
+        block_size = CVE_MEM_CHUNK_SIZE;
+    } else {
+        block_size = nrow;
+    }
+
+    // Iterate `(block_size_i, ncol)` submatrix blocks.
+    for (i = 0; i < nrow; i += block_size_i) {
+        // Reset `A` to new block beginning.
+        A = A_block;
+        // Take block size of eveything left and reduce to max size.
+        block_size_i = nrow - i;
+        if (block_size_i > block_size) {
+            block_size_i = block_size;
+        }
+        // Compute first blocks column,
+        for (j = 0; j < block_size_i; ++j) {
+            sum[j] = A[j];
+        }
+        // and sum the following columns to the first one.
+        for (A += nrow; A < A_end; A += nrow) {
+            for (j = 0; j < block_size_i; ++j) {
+                sum[j] += A[j];
+            }
+        }
+        // Step one block forth.
+        A_block += block_size_i;
+        sum += block_size_i;
+    }
+}
+
+static inline void colSums(const double *A,
+                           const int nrow, const int ncol,
+                           double *sum) {
+    int j;
+    double *sum_end = sum + ncol;
+
+    memset(sum, 0, sizeof(double) * ncol);
+    for (; sum < sum_end; ++sum) {
+        for (j = 0; j < nrow; ++j) {
+            *sum += A[j];
+        }
+        A += nrow;
+    }
+}
+
+static inline void rowSquareSums(const double *A,
+                                 const int nrow, const int ncol,
+                                 double *sum) {
+    int i, j, block_size, block_size_i;
+    const double *A_block = A;
+    const double *A_end = A + nrow * ncol;
+
+    if (nrow < CVE_MEM_CHUNK_SIZE) {
+        block_size = nrow;
+    } else {
+        block_size = CVE_MEM_CHUNK_SIZE;
+    }
+
+    // Iterate `(block_size_i, ncol)` submatrix blocks.
+    for (i = 0; i < nrow; i += block_size_i) {
+        // Reset `A` to new block beginning.
+        A = A_block;
+        // Take block size of eveything left and reduce to max size.
+        block_size_i = nrow - i;
+        if (block_size_i > block_size) {
+            block_size_i = block_size;
+        }
+        // Compute first blocks column,
+        for (j = 0; j < block_size_i; ++j) {
+            sum[j] = A[j] * A[j];
+        }
+        // and sum the following columns to the first one.
+        for (A += nrow; A < A_end; A += nrow) {
+            for (j = 0; j < block_size_i; ++j) {
+                sum[j] += A[j] * A[j];
+            }
+        }
+        // Step one block forth.
+        A_block += block_size_i;
+        sum += block_size_i;
+    }
+}
+
+static inline void rowSumsSymVec(const double *Avec, const int nrow,
+                                 const double *diag,
+                                 double *sum) {
+    int i, j;
+
+    if (*diag == 0.0) {
+        memset(sum, 0, nrow * sizeof(double));
+    } else {
+        for (i = 0; i < nrow; ++i) {
+            sum[i] = *diag;
+        }
+    }
+
+    for (j = 0; j < nrow; ++j) {
+        for (i = j + 1; i < nrow; ++i, ++Avec) {
+            sum[j] += *Avec;
+            sum[i] += *Avec;
+        }
+    }
+}
+
+/* C[, j] = A[, j] * v for each j = 1 to ncol */
+static void rowSweep(const double *A, const int nrow, const int ncol,
+                     const char* op,
+                     const double *v, // vector of length nrow
+                     double *C) {
+    int i, j, block_size, block_size_i;
+    const double *A_block = A;
+    double *C_block = C;
+    const double *A_end = A + nrow * ncol;
+
+    if (nrow > CVE_MEM_CHUNK_SMALL) { // small because 3 vectors in cache
+        block_size = CVE_MEM_CHUNK_SMALL;
+    } else {
+        block_size = nrow;
+    }
+
+    if (*op == '+') {
+        // Iterate `(block_size_i, ncol)` submatrix blocks.
+        for (i = 0; i < nrow; i += block_size_i) {
+            // Set `A` and `C` to block beginning.
+            A = A_block;
+            C = C_block;
+            // Get current block's row size.
+            block_size_i = nrow - i;
+            if (block_size_i > block_size) {
+                block_size_i = block_size;
+            }
+            // Perform element wise operation for block.
+            for (; A < A_end; A += nrow, C += nrow) {
+                for (j = 0; j < block_size_i; ++j) {
+                    C[j] = A[j] + v[j]; // FUN = '+'
+                }
+            }
+            // Step one block forth.
+            A_block += block_size_i;
+            C_block += block_size_i;
+            v += block_size_i;
+        }
+    } else if (*op == '-') {
+        // Iterate `(block_size_i, ncol)` submatrix blocks.
+        for (i = 0; i < nrow; i += block_size_i) {
+            // Set `A` and `C` to block beginning.
+            A = A_block;
+            C = C_block;
+            // Get current block's row size.
+            block_size_i = nrow - i;
+            if (block_size_i > block_size) {
+                block_size_i = block_size;
+            }
+            // Perform element wise operation for block.
+            for (; A < A_end; A += nrow, C += nrow) {
+                for (j = 0; j < block_size_i; ++j) {
+                    C[j] = A[j] - v[j]; // FUN = '-'
+                }
+            }
+            // Step one block forth.
+            A_block += block_size_i;
+            C_block += block_size_i;
+            v += block_size_i;
+        }
+    } else if (*op == '*') {
+        // Iterate `(block_size_i, ncol)` submatrix blocks.
+        for (i = 0; i < nrow; i += block_size_i) {
+            // Set `A` and `C` to block beginning.
+            A = A_block;
+            C = C_block;
+            // Get current block's row size.
+            block_size_i = nrow - i;
+            if (block_size_i > block_size) {
+                block_size_i = block_size;
+            }
+            // Perform element wise operation for block.
+            for (; A < A_end; A += nrow, C += nrow) {
+                for (j = 0; j < block_size_i; ++j) {
+                    C[j] = A[j] * v[j]; // FUN = '*'
+                }
+            }
+            // Step one block forth.
+            A_block += block_size_i;
+            C_block += block_size_i;
+            v += block_size_i;
+        }
+    } else if (*op == '/') {
+        // Iterate `(block_size_i, ncol)` submatrix blocks.
+        for (i = 0; i < nrow; i += block_size_i) {
+            // Set `A` and `C` to block beginning.
+            A = A_block;
+            C = C_block;
+            // Get current block's row size.
+            block_size_i = nrow - i;
+            if (block_size_i > block_size) {
+                block_size_i = block_size;
+            }
+            // Perform element wise operation for block.
+            for (; A < A_end; A += nrow, C += nrow) {
+                for (j = 0; j < block_size_i; ++j) {
+                    C[j] = A[j] / v[j]; // FUN = '/'
+                }
+            }
+            // Step one block forth.
+            A_block += block_size_i;
+            C_block += block_size_i;
+            v += block_size_i;
+        }
+    } else {
+        error("Got unknown 'op' (opperation) argument.");
+    }
+}
+
+static inline void matrixprod(const double *A, const int nrowA, const int ncolA,
+                              const double *B, const int nrowB, const int ncolB,
+                              double *C) {
+    const double one = 1.0;
+    const double zero = 0.0;
+
+    // DGEMM with parameterization:
+    //     C <- A %*% B
+    F77_NAME(dgemm)("N", "N", &nrowA, &ncolB, &ncolA,
+                    &one, A, &nrowA, B, &nrowB,
+                    &zero, C, &nrowA);
+}
+
+static inline void crossprod(const double *A, const int nrowA, const int ncolA,
+                             const double *B, const int nrowB, const int ncolB,
+                             double *C) {
+    const double one = 1.0;
+    const double zero = 0.0;
+
+    // DGEMM with parameterization:
+    //     C <- t(A) %*% B
+    F77_NAME(dgemm)("T", "N", &ncolA, &ncolB, &nrowA,
+                    &one, A, &nrowA, B, &nrowB,
+                    &zero, C, &ncolA);
+}
+
+static inline void nullProj(const double *B, const int nrowB, const int ncolB,
+                            double *Q) {
+    const double minusOne = -1.0;
+    const double one = 1.0;
+
+    // Initialize as identity matrix.
+    memset(Q, 0, sizeof(double) * nrowB * nrowB);
+    double *Q_diag, *Q_end = Q + nrowB * nrowB;
+    for (Q_diag = Q; Q_diag < Q_end; Q_diag += nrowB + 1) {
+        *Q_diag = 1.0;
+    }
+
+    // DGEMM with parameterization:
+    //     C <- (-1.0 * B %*% t(B)) + C
+    F77_NAME(dgemm)("N", "T", &nrowB, &nrowB, &ncolB,
+                    &minusOne, B, &nrowB, B, &nrowB,
+                    &one, Q, &nrowB);
+}
+
+static inline void rangePairs(const int from, const int to, int *pairs) {
+    int i, j;
+    for (i = from; i < to; ++i) {
+       for (j = i + 1; j < to; ++j) {
+            pairs[0] = i;
+            pairs[1] = j;
+            pairs += 2;
+        }
+    }
+}
+
+// A dence skwe-symmetric rank 2 update.
+// Perform the update
+//      C := alpha (A * B^T - B * A^T) + beta C
+static void skewSymRank2k(const int nrow, const int ncol,
+                          double alpha, const double *A, const double *B,
+                          double beta,
+                          double *C) {
+    F77_NAME(dgemm)("N", "T",
+             &nrow, &nrow, &ncol,
+             &alpha, A, &nrow, B, &nrow,
+             &beta, C, &nrow);
+
+    alpha *= -1.0;
+    beta = 1.0;
+    F77_NAME(dgemm)("N", "T",
+             &nrow, &nrow, &ncol,
+             &alpha, B, &nrow, A, &nrow,
+             &beta, C, &nrow);
+}
+
+// TODO: mutch potential for optimization!!!
+static inline void weightedYandLoss(const int n,
+                                    const double *Y,
+                                    const double *vecD,
+                                    const double *vecW,
+                                    const double *colSums,
+                                    double *y1, double *L, double *vecS,
+                                    double *const loss) {
+    int i, j, k, N = n * (n - 1) / 2;
+    double l;
+
+    for (i = 0; i < n; ++i) {
+        y1[i] = Y[i] / colSums[i];
+        L[i] = Y[i] * Y[i] / colSums[i];
+    }
+
+    for (k = j = 0; j < n; ++j) {
+        for (i = j + 1; i < n; ++i, ++k) {
+            y1[i] += Y[j] * vecW[k] / colSums[i];
+            y1[j] += Y[i] * vecW[k] / colSums[j];
+            L[i] += Y[j] * Y[j] * vecW[k] / colSums[i];
+            L[j] += Y[i] * Y[i] * vecW[k] / colSums[j];
+        }
+    }
+
+    l = 0.0;
+    for (i = 0; i < n; ++i) {
+        l += (L[i] -= y1[i] * y1[i]);
+    }
+    *loss = l / (double)n;
+
+    for (k = j = 0; j < n; ++j) {
+        for (i = j + 1; i < n; ++i, ++k) {
+            l = Y[j] - y1[i];
+            vecS[k]  = (L[i] - (l * l)) / colSums[i];
+            l = Y[i] - y1[j];
+            vecS[k] += (L[j] - (l * l)) / colSums[j];
+        }
+    }
+
+    for (k = 0; k < N; ++k) {
+        vecS[k] *= vecW[k] * vecD[k];
+    }
+}
+
+inline double gaussKernel(const double x, const double scale) {
+    return exp(scale * x * x);
+}
+
+static void gradient(const int n, const int p, const int q,
+                     const double *X,
+                     const double *X_diff,
+                     const double *Y,
+                     const double *V,
+                     const double h,
+                     double *G, double *const loss) {
+    // Number of X_i to X_j not trivial pairs.
+    int i, N = (n * (n - 1)) / 2;
+    double scale = -0.5 / h;
+    const double one = 1.0;
+
+    if (X_diff == (void*)0) {
+        // TODO: ...
+    }
+
+    // Allocate and compute projection matrix `Q = I_p - V * V^T`
+    double *Q = (double*)malloc(p * p * sizeof(double));
+    nullProj(V, p, q, Q);
+
+    // allocate and compute vectorized distance matrix with a temporary
+    // projection of `X_diff`.
+    double *vecD = (double*)malloc(N * sizeof(double));
+    double *X_proj;
+    if (p < 5) { // TODO: refine that!
+        X_proj = (double*)malloc(N * 5 * sizeof(double));
+    } else {
+        X_proj = (double*)malloc(N * p * sizeof(double));
+    }
+    matrixprod(X_diff, N, p, Q, p, p, X_proj);
+    rowSquareSums(X_proj, N, p, vecD);
+
+    // Apply kernel to distence vector for weights computation.
+    double *vecW = X_proj; // reuse memory area, no longer needed.
+    for (i = 0; i < N; ++i) {
+        vecW[i] = gaussKernel(vecD[i], scale);
+    }
+    double *colSums = X_proj + N; // still allocated!
+    rowSumsSymVec(vecW, n, &one, colSums); // rowSums = colSums cause Sym
+
+    // compute weighted responces of first end second momontum, aka y1, y2.
+    double *y1 = X_proj + N + n;
+    double *L = X_proj + N + (2 * n);
+    // Allocate X_diff scaling vector `vecS`, not in `X_proj` mem area because
+    // used symultanious to X_proj in final gradient computation.
+    double *vecS = (double*)malloc(N * sizeof(double));
+    weightedYandLoss(n, Y, vecD, vecW, colSums, y1, L, vecS, loss);
+
+    // compute the gradient using X_proj for intermidiate scaled X_diff.
+    rowSweep(X_diff, N, p, "*", vecS, X_proj);
+    // reuse Q which has the required dim (p, p).
+    crossprod(X_diff, N, p, X_proj, N, p, Q);
+    // Product with V
+    matrixprod(Q, p, p, V, p, q, G);
+    // And final scaling (TODO: move into matrixprod!)
+    scale = -2.0 / (((double)n) * h * h);
+    N = p * q;
+    for (i = 0; i < N; ++i) {
+        G[i] *= scale;
+    }
+
+    free(vecS);
+    free(X_proj);
+    free(vecD);
+    free(Q);
+}
--- a/wip.h
+++ b/wip.h
@ -0,0 +1,159 @@
+#ifndef _CVE_INCLUDE_GUARD_
+#define _CVE_INCLUDE_GUARD_
+
+#include <Rinternals.h>
+
+#define CVE_MEM_CHUNK_SMALL 1016
+#define CVE_MEM_CHUNK_SIZE 2032
+
+static inline void rowSums(const double *A,
+                           const int nrow, const int ncol,
+                           double *sum);
+SEXP R_rowSums(SEXP A) {
+    SEXP sums = PROTECT(allocVector(REALSXP, nrows(A)));
+
+    rowSums(REAL(A), nrows(A), ncols(A), REAL(sums));
+
+    UNPROTECT(1);
+    return sums;
+}
+
+static inline void colSums(const double *A,
+                           const int nrow, const int ncol,
+                           double *sum);
+SEXP R_colSums(SEXP A) {
+    SEXP sums = PROTECT(allocVector(REALSXP, ncols(A)));
+
+    colSums(REAL(A), nrows(A), ncols(A), REAL(sums));
+
+    UNPROTECT(1);
+    return sums;
+}
+
+static inline void rowSquareSums(const double*, const int, const int, double*);
+SEXP R_rowSquareSums(SEXP A) {
+    SEXP result = PROTECT(allocVector(REALSXP, nrows(A)));
+
+    rowSquareSums(REAL(A), nrows(A), ncols(A), REAL(result));
+
+    UNPROTECT(1);
+    return result;
+}
+
+static inline void rowSumsSymVec(const double *Avec, const int nrow,
+                                 const double *diag,
+                                 double *sum);
+SEXP R_rowSumsSymVec(SEXP Avec, SEXP nrow, SEXP diag) {
+    SEXP sum = PROTECT(allocVector(REALSXP, *INTEGER(nrow)));
+
+    rowSumsSymVec(REAL(Avec), *INTEGER(nrow), REAL(diag), REAL(sum));
+
+    UNPROTECT(1);
+    return sum;
+}
+
+static void rowSweep(const double *A, const int nrow, const int ncol,
+                     const char* op,
+                     const double *v, // vector of length nrow
+                     double *C);
+SEXP R_rowSweep(SEXP A, SEXP v, SEXP op) {
+    SEXP C = PROTECT(allocMatrix(REALSXP, nrows(A), ncols(A)));
+
+    rowSweep(REAL(A), nrows(A), ncols(A),
+             CHAR(STRING_ELT(op, 0)),
+             REAL(v), REAL(C));
+
+    UNPROTECT(1);
+    return C;
+}
+
+static inline void matrixprod(const double *A, const int nrowA, const int ncolA,
+                              const double *B, const int nrowB, const int ncolB,
+                              double *C);
+SEXP R_matrixprod(SEXP A, SEXP B) {
+    SEXP C = PROTECT(allocMatrix(REALSXP, nrows(A), ncols(B)));
+
+    matrixprod(REAL(A), nrows(A), ncols(A),
+               REAL(B), nrows(B), ncols(B),
+               REAL(C));
+
+    UNPROTECT(1);
+    return C;
+}
+
+static inline void crossprod(const double* A, const int nrowA, const int ncolA,
+                             const double* B, const int ncolB, const int nrowB,
+                             double* C);
+SEXP R_crossprod(SEXP A, SEXP B) {
+    SEXP C = PROTECT(allocMatrix(REALSXP, ncols(A), ncols(B)));
+
+    crossprod(REAL(A), nrows(A), ncols(A),
+              REAL(B), nrows(B), ncols(B),
+              REAL(C));
+
+    UNPROTECT(1);
+    return C;
+}
+
+static void skewSymRank2k(const int n, const int k,
+                          double alpha, const double *A, const double *B,
+                          double beta,
+                          double *C);
+SEXP R_skewSymRank2k(SEXP A, SEXP B, SEXP alpha, SEXP beta) {
+    SEXP C = PROTECT(allocMatrix(REALSXP, nrows(A), nrows(A)));
+    memset(REAL(C), 0, nrows(A) * nrows(A) * sizeof(double));
+
+    skewSymRank2k(nrows(A), ncols(A),
+                  *REAL(alpha), REAL(A), REAL(B),
+                  *REAL(beta), REAL(C));
+
+    UNPROTECT(1);
+    return C;
+}
+
+static inline void nullProj(const double* B, const int nrowB, const int ncolB,
+                            double* Q);
+SEXP R_nullProj(SEXP B) {
+    SEXP Q = PROTECT(allocMatrix(REALSXP, nrows(B), nrows(B)));
+
+    nullProj(REAL(B), nrows(B), ncols(B), REAL(Q));
+
+    UNPROTECT(1);
+    return Q;
+}
+
+static inline void rangePairs(const int from, const int to, int *pairs);
+SEXP R_rangePairs(SEXP from, SEXP to) {
+    int start = asInteger(from);
+    int end = asInteger(to) + 1;
+    int n = end - start;
+
+    SEXP out = PROTECT(allocMatrix(INTSXP, 2, n * (n - 1) / 2));
+    rangePairs(start, end, INTEGER(out));
+
+    UNPROTECT(1);
+    return out;
+}
+
+static void gradient(const int n, const int p, const int q,
+                     const double *X,
+                     const double *X_diff,
+                     const double *Y,
+                     const double *V,
+                     const double h,
+                     double *G, double *const loss);
+SEXP R_gradient(SEXP X, SEXP X_diff, SEXP Y, SEXP V, SEXP h) {
+    int N = (nrows(X) * (nrows(X) - 1)) / 2;
+
+    SEXP G = PROTECT(allocMatrix(REALSXP, nrows(V), ncols(V)));
+    SEXP loss = PROTECT(allocVector(REALSXP, 1));
+
+    gradient(nrows(X), ncols(X), ncols(V),
+             REAL(X), REAL(X_diff), REAL(Y), REAL(V), *REAL(h),
+             REAL(G), REAL(loss));
+
+    UNPROTECT(2);
+    return G;
+}
+
+#endif /* _CVE_INCLUDE_GUARD_ */