From 4b68c245a686a94a452aeefeab183d9d0668a190 Mon Sep 17 00:00:00 2001 From: daniel Date: Thu, 5 Dec 2019 17:35:29 +0100 Subject: [PATCH] rewrote (C): to use new Gradient formula, rewrote (C): subroutine interface to use matrix struct --- CVE_C/NAMESPACE | 3 +- CVE_C/R/CVE.R | 109 +++- CVE_C/R/coef.R | 33 +- CVE_C/R/directions.R | 18 + CVE_C/R/estimateBandwidth.R | 19 + CVE_C/R/plot.R | 25 + CVE_C/R/predict.R | 26 + CVE_C/R/predict_dim.R | 27 +- CVE_C/R/summary.R | 19 + CVE_C/man/coef.cve.Rd | 33 +- CVE_C/man/cve.Rd | 63 +- CVE_C/man/cve.call.Rd | 34 +- CVE_C/man/directions.cve.Rd | 19 + CVE_C/man/estimate.bandwidth.Rd | 20 + CVE_C/man/plot.cve.Rd | 27 + CVE_C/man/predict.cve.Rd | 27 + CVE_C/man/predict.dim.Rd | 24 - CVE_C/man/summary.cve.Rd | 19 + CVE_C/src/callLogger.c | 22 +- CVE_C/src/cve.c | 265 ++++---- CVE_C/src/cve.h | 167 +++-- CVE_C/src/cve_subroutines.c | 320 +++++++--- CVE_C/src/export.c | 71 ++- CVE_C/src/init.c | 16 +- CVE_C/src/matrix.c | 1053 +++++++++++++++++++++++++++---- CVE_C/src/rStiefel.c | 89 +-- CVE_C/src/rowColOp.c | 160 ----- CVE_C/src/sweep.c | 51 -- README.md | 4 +- runtime_test.R | 6 +- test.R | 4 +- 31 files changed, 1915 insertions(+), 858 deletions(-) delete mode 100644 CVE_C/man/predict.dim.Rd delete mode 100644 CVE_C/src/rowColOp.c delete mode 100644 CVE_C/src/sweep.c diff --git a/CVE_C/NAMESPACE b/CVE_C/NAMESPACE index d2c0f0f..0f9e2ff 100644 --- a/CVE_C/NAMESPACE +++ b/CVE_C/NAMESPACE @@ -4,7 +4,6 @@ S3method(coef,cve) S3method(directions,cve) S3method(plot,cve) S3method(predict,cve) -S3method(predict.dim,cve) S3method(summary,cve) export(cve) export(cve.call) @@ -13,7 +12,7 @@ export(directions) export(elem.pairs) export(estimate.bandwidth) export(null) -export(predict.dim) +export(predict_dim) export(projTangentStiefel) export(rStiefel) export(retractStiefel) diff --git a/CVE_C/R/CVE.R b/CVE_C/R/CVE.R index 0a7a35d..8246733 100644 --- a/CVE_C/R/CVE.R +++ b/CVE_C/R/CVE.R @@ -45,6 +45,7 @@ #' below. #' \item "weighted" variation with addaptive weighting of slices. #' } +#' @param max.dim upper bounds for \code{k}, (ignored if \code{k} is supplied). #' @param ... Parameters passed on to \code{cve.call}. #' #' @return an S3 object of class \code{cve} with components: @@ -58,19 +59,54 @@ #' } #' #' @examples -#' # create dataset -#' x <- matrix(rnorm(400), 100, 4) -#' y <- x[, 1] + x[, 2] + as.matrix(rnorm(100)) -#' -#' # Call CVE. -#' dr <- cve(y ~ x) -#' # Call weighted CVE. -#' dr.weighted <- cve(y ~ x, method = "weighted") -#' -#' # Training data responces of reduced data. -#' y.est <- directions(dr, 1) -#' # Extract SDR subspace basis of dimension 1. 
-#' B <- coef(dr.momentum, 1)
+#' # set dimensions for simulation model
+#' p <- 8
+#' k <- 2
+#' # create B for simulation
+#' b1 <- rep(1 / sqrt(p), p)
+#' b2 <- (-1)^seq(1, p) / sqrt(p)
+#' B <- cbind(b1, b2)
+#' # sample size
+#' n <- 200
+#' set.seed(21)
+#' # create predictor data x ~ N(0, I_p)
+#' x <- matrix(rnorm(n * p), n, p)
+#' # simulate response variable
+#' # y = f(B'x) + err
+#' # with f(x1, x2) = x1^2 + 2 * x2 and err ~ N(0, 0.25^2)
+#' y <- (x %*% b1)^2 + 2 * (x %*% b2) + 0.25 * rnorm(100)
+#' # calculate cve with method 'simple' for k unknown in 1, ..., 4
+#' cve.obj.s <- cve(y ~ x, max.dim = 4) # default method 'simple'
+#' # calculate cve with method 'weighted' for k = 2
+#' cve.obj.w <- cve(y ~ x, k = 2, method = 'weighted')
+#' # estimate dimension from cve.obj.s
+#' khat <- predict_dim(cve.obj.s)$k
+#' # get cve-estimate for B with dimensions (p, k = khat)
+#' B2 <- coef(cve.obj.s, k = khat)
+#' # get projected X data (same as cve.obj.s$X %*% B2)
+#' proj.X <- directions(cve.obj.s, k = khat)
+#' # plot y against projected data
+#' plot(proj.X[, 1], y)
+#' plot(proj.X[, 2], y)
+#' # create 10 new x points and y according to model
+#' x.new <- matrix(rnorm(10 * p), 10, p)
+#' y.new <- (x.new %*% b1)^2 + 2 * (x.new %*% b2) + 0.25 * rnorm(10)
+#' # predict y.new
+#' yhat <- predict(cve.obj.s, x.new, khat)
+#' plot(y.new, yhat)
+#' # projection matrix on span(B)
+#' # same as B %*% t(B) since B is semi-orthogonal
+#' PB <- B %*% solve(t(B) %*% B) %*% t(B)
+#' # cve estimates for B with simple and weighted method
+#' B.s <- coef(cve.obj.s, k = 2)
+#' B.w <- coef(cve.obj.w, k = 2)
+#' # same as B.s %*% t(B.s) since B.s is semi-orthogonal (same for B.w)
+#' PB.s <- B.s %*% solve(t(B.s) %*% B.s) %*% t(B.s)
+#' PB.w <- B.w %*% solve(t(B.w) %*% B.w) %*% t(B.w)
+#' # compare estimation accuracy of simple and weighted cve estimate by
+#' # Frobenius norm of difference of projections.
+#' norm(PB - PB.s, type = 'F')
+#' norm(PB - PB.w, type = 'F')
 #'
 #' @seealso For a detailed description of \code{formula} see
 #' \code{\link{formula}}.
@@ -107,6 +143,12 @@ cve <- function(formula, data, method = "simple", max.dim = 10L, ...) {
 #' \code{\link{estimate.bandwidth}} (ignored if \code{h} is supplied).
 #' @param X data matrix with samples in its rows.
 #' @param Y Responses (1 dimensional).
+#' @param method specifies the CVE method variation as one of
+#' \itemize{
+#'    \item "simple" exact implementation as described in the paper listed
+#'          below.
+#'    \item "weighted" variation with adaptive weighting of slices.
+#' }
 #' @param k Dimension of lower dimensional projection, if \code{k} is given
 #' only the specified dimension \code{B} matrix is estimated.
 #' @param min.dim lower bounds for \code{k}, (ignored if \code{k} is supplied).
@@ -124,28 +166,35 @@ cve <- function(formula, data, method = "simple", max.dim = 10L, ...) {
 #' @param slack Positive scaling to allow small increases of the loss while
 #' optimizing.
 #' @param gamma step-size reduction multiple.
-#' @param V.init Semi-orthogonal matrix of dimensions `(ncol(X), ncol(X) - k)` #' as optimization starting value. (If supplied, \code{attempts} is
+#' @param V.init Semi-orthogonal matrix of dimensions `(ncol(X), ncol(X) - k)`
+#' as optimization starting value.
(If supplied, \code{attempts} is
 #' set to 1 and \code{k} to match dimension)
 #'
 #' @inherit cve return
 #'
 #' @examples
-#' # Create a dataset (n samples):
-#' n <- 100
-#' X <- matrix(rnorm(4 * n), n)
-#' Y <- matrix(X[, 1] + cos(X[, 2]) + rnorm(n, 0, .1), n)
+#' # create B for simulation (k = 1)
+#' B <- rep(1, 5) / sqrt(5)
 #'
-#' # Create logger function:
-#' logger <- function(attempt, iter, data) {
-#'   if (iter == 0) {
-#'     cat("Starting attempt nr:", attempt, "\n")
-#'   }
-#'   cat(" iter:", iter, "loss:", data$loss, "\n")
-#' }
-#'
-#' Call 'cve' with logger:
-#' cve(Y ~ X, logger = logger)
+#' set.seed(21)
+#' # create predictor data X ~ N(0, I_p)
+#' X <- matrix(rnorm(500), 100, 5)
+#' # simulate response variable
+#' # Y = f(B'X) + err
+#' # with f(x1) = x1 and err ~ N(0, 0.25^2)
+#' Y <- X %*% B + 0.25 * rnorm(100)
+#'
+#' # calculate cve with method 'simple' for k = 1
+#' set.seed(21)
+#' cve.obj.simple1 <- cve(Y ~ X, k = 1)
+#'
+#' # same as
+#' set.seed(21)
+#' cve.obj.simple2 <- cve.call(X, Y, k = 1)
 #'
+#' # extract estimated B's.
+#' coef(cve.obj.simple1, k = 1)
+#' coef(cve.obj.simple2, k = 1)
 #' @export
 cve.call <- function(X, Y, method = "simple",
                      nObs = sqrt(nrow(X)), h = NULL,
@@ -178,7 +227,7 @@ cve.call <- function(X, Y, method = "simple",
     }
     if (!(is.matrix(X) && is.numeric(X))) {
-        stop("Parameter 'X' should be a numeric matrices.")
+        stop("Parameter 'X' should be a numeric matrix.")
     }
     if (!is.numeric(Y)) {
         stop("Parameter 'Y' must be numeric.")
     }
@@ -288,7 +337,7 @@ cve.call <- function(X, Y, method = "simple",
         h <- estimate.bandwidth(X, k, nObs)
     }
 
-    dr.k <- .Call('cve', PACKAGE = 'CVE',
+    dr.k <- .Call('cve_export', PACKAGE = 'CVE',
                   X, Y, k, h,
                   method_nr,
                   V.init,
diff --git a/CVE_C/R/coef.R b/CVE_C/R/coef.R
index a5b2ad6..6d71308 100644
--- a/CVE_C/R/coef.R
+++ b/CVE_C/R/coef.R
@@ -9,10 +9,35 @@
 #' @return dir the matrix of CS or CMS of given dimension
 #'
 #' @examples
-#' x <- matrix(rnorm(400),100,4)
-#' y <- x[, 1] + x[, 2] + as.matrix(rnorm(100))
-#' dr <- cve(y ~ x, k = 2) # Only for sub-space dim. 2
-#' B2 <- coef(dr, 2)
+#' # set dimensions for simulation model
+#' p <- 8 # sample dimension
+#' k <- 2 # real dimension of SDR subspace
+#' n <- 200 # sample size
+#' # create B for simulation
+#' b1 <- rep(1 / sqrt(p), p)
+#' b2 <- (-1)^seq(1, p) / sqrt(p)
+#' B <- cbind(b1, b2)
+#'
+#' set.seed(21)
+#' # create predictor data x ~ N(0, I_p)
+#' x <- matrix(rnorm(n * p), n, p)
+#' # simulate response variable
+#' # y = f(B'x) + err
+#' # with f(x1, x2) = x1^2 + 2 * x2 and err ~ N(0, 0.25^2)
+#' y <- (x %*% b1)^2 + 2 * (x %*% b2) + 0.25 * rnorm(100)
+#' # calculate cve for k = 1, ..., 5
+#' cve.obj <- cve(y ~ x, max.dim = 5)
+#' # get cve-estimate for B with dimensions (p, k = 2)
+#' B2 <- coef(cve.obj, k = 2)
+#'
+#' # Projection matrix on span(B)
+#' # equivalent to `B %*% t(B)` since B is semi-orthonormal
+#' PB <- B %*% solve(t(B) %*% B) %*% t(B)
+#' # Projection matrix on span(B2)
+#' # equivalent to `B2 %*% t(B2)` since B2 is semi-orthonormal
+#' PB2 <- B2 %*% solve(t(B2) %*% B2) %*% t(B2)
+#' # compare estimation accuracy by Frobenius norm of difference of projections
+#' norm(PB - PB2, type = 'F')
 #'
 #' @method coef cve
 #' @aliases coef.cve
diff --git a/CVE_C/R/directions.R b/CVE_C/R/directions.R
index 1a9d76d..4546539 100644
--- a/CVE_C/R/directions.R
+++ b/CVE_C/R/directions.R
@@ -8,6 +8,24 @@ directions <- function(dr, k) {
 #' @param dr Instance of 'cve' as returned by \code{cve}.
 #' @param k SDR dimension to use for projection.
#' +#' @examples +#' # create B for simulation (k = 1) +#' B <- rep(1, 5) / sqrt(5) +#' set.seed(21) +#' # creat predictor data x ~ N(0, I_p) +#' x <- matrix(rnorm(500), 100, 5) +#' # simulate response variable +#' # y = f(B'x) + err +#' # with f(x1) = x1 and err ~ N(0, 0.25^2) +#' y <- x %*% B + 0.25 * rnorm(100) +#' # calculate cve with method 'simple' for k = 1 +#' set.seed(21) +#' cve.obj.simple <- cve(y ~ x, k = 1, method = 'simple') +#' # get projected data for k = 1 +#' x.proj <- directions(cve.obj.simple, k = 1) +#' # plot y against projected data +#' plot(x.proj, y) +#' #' @method directions cve #' @aliases directions directions.cve #' @export diff --git a/CVE_C/R/estimateBandwidth.R b/CVE_C/R/estimateBandwidth.R index d358f40..15543dc 100644 --- a/CVE_C/R/estimateBandwidth.R +++ b/CVE_C/R/estimateBandwidth.R @@ -14,6 +14,25 @@ #' #' @return Estimated bandwidth \code{h}. #' +#' @examples +#' # set dimensions for simulation model +#' p <- 5; k <- 1 +#' # create B for simulation +#' B <- rep(1, p) / sqrt(p) +#' # samplsize +#' n <- 100 +#' set.seed(21) +#' #creat predictor data x ~ N(0, I_p) +#' x <- matrix(rnorm(n * p), n, p) +#' # simulate response variable +#' # y = f(B'x) + err +#' # with f(x1) = x1 and err ~ N(0, 0.25^2) +#' y <- x %*% B + 0.25 * rnorm(100) +#' # calculate cve with method 'simple' for k = 1 +#' set.seed(21) +#' cve.obj.simple <- cve(y ~ x, k = k) +#' print(cve.obj.simple$res$'1'$h) +#' print(estimate.bandwidth(x, k = k)) #' @export estimate.bandwidth <- function(X, k, nObs) { n <- nrow(X) diff --git a/CVE_C/R/plot.R b/CVE_C/R/plot.R index 96bd479..d981fff 100644 --- a/CVE_C/R/plot.R +++ b/CVE_C/R/plot.R @@ -5,6 +5,31 @@ #' @param x Object of class \code{"cve"} (result of [\code{\link{cve}}]). #' @param ... Pass through parameters to [\code{\link{plot}}] and #' [\code{\link{lines}}] +#' @examples +#' # create B for simulation +#' B <- cbind(rep(1, 6), (-1)^seq(6)) / sqrt(6) +#' +#' set.seed(21) +#' # creat predictor data x ~ N(0, I_p) +#' X <- matrix(rnorm(600), 100) +#' +#' # simulate response variable +#' # y = f(B'x) + err +#' # with f(x1, x2) = x1^2 + 2 x2 and err ~ N(0, 0.25^2) +#' Y <- (X %*% B[, 1])^2 + 2 * X %*% B[, 2] + rnorm(100, 0, .1) +#' +#' # Create bandwidth estimation function +#' estimate.bandwidth <- function(X, k, nObs) { +#' n <- nrow(X) +#' p <- ncol(X) +#' X_c <- scale(X, center = TRUE, scale = FALSE) +#' 2 * qchisq((nObs - 1) / (n - 1), k) * sum(X_c^2) / (n * p) +#' } +#' # calculate cve with method 'simple' for k = min.dim,...,max.dim +#' cve.obj.simple <- cve(Y ~ X, h = estimate.bandwidth, nObs = sqrt(nrow(X))) +#' +#' # elbow plot +#' plot(cve.obj.simple) #' #' @seealso see \code{\link{par}} for graphical parameters to pass through #' as well as \code{\link{plot}}, the standard plot utility. diff --git a/CVE_C/R/predict.R b/CVE_C/R/predict.R index 074db6e..f882d27 100644 --- a/CVE_C/R/predict.R +++ b/CVE_C/R/predict.R @@ -10,6 +10,32 @@ #' #' @return prediced response of data \code{newdata}. 
#' +#' @examples +#' # create B for simulation +#' B <- rep(1, 5) / sqrt(5) +#' +#' set.seed(21) +#' # creat predictor data x ~ N(0, I_p) +#' x <- matrix(rnorm(500), 100) +#' +#' # simulate response variable +#' # y = f(B'x) + err +#' # with f(x1) = x1 and err ~ N(0, 0.25^2) +#' y <- x %*% B + 0.25 * rnorm(100) +#' +#' x.train <- x[1:80, ] +#' x.test <- x[81:100, ] +#' y.train <- y[1:80, ] +#' y.test <- y[81:100, ] +#' +#' # calculate cve with method 'simple' for k = 1 +#' cve.obj.simple <- cve(y.train ~ x.train, k = 1) +#' +#' # predict y.test from x.test +#' yhat <- predict(cve.obj.simple, x.test, 1) +#' +#' # plot prediction against y.test +#' plot(yhat, y.test) #' @seealso \code{\link{cve}}, \code{\link{cve.call}} or \pkg{\link{mars}}. #' #' @rdname predict.cve diff --git a/CVE_C/R/predict_dim.R b/CVE_C/R/predict_dim.R index 5b1396c..5784f92 100644 --- a/CVE_C/R/predict_dim.R +++ b/CVE_C/R/predict_dim.R @@ -10,15 +10,26 @@ #' \item k: predicted dimensions. #' } #' +#' @examples +#' # create B for simulation +#' B <- rep(1, 5) / sqrt(5) +#' +#' set.seed(21) +#' # creat predictor data x ~ N(0, I_p) +#' x <- matrix(rnorm(500), 100) +#' +#' # simulate response variable +#' # y = f(B'x) + err +#' # with f(x1) = x1 and err ~ N(0, 0.25^2) +#' y <- x %*% B + 0.25 * rnorm(100) +#' +#' # Calculate cve for unknown k between min.dim and max.dim. +#' cve.obj.simple <- cve(y ~ x) +#' +#' predict_dim(cve.obj.simple) +#' #' @export -predict.dim <- function(object, ...) { - UseMethod("predict.dim") -} - -#' @aliases predict.dim -#' @method predict.dim cve -#' @export -predict.dim.cve <- function(object, ...) { +predict_dim <- function(object, ...) { # Get centered training data and dimensions X <- scale(object$X, center = TRUE, scale = FALSE) n <- nrow(object$X) # umber of training data samples diff --git a/CVE_C/R/summary.R b/CVE_C/R/summary.R index 9403f09..a50d6d6 100644 --- a/CVE_C/R/summary.R +++ b/CVE_C/R/summary.R @@ -1,6 +1,25 @@ #' Prints a summary of a \code{cve} result. #' @param object Instance of 'cve' as returned by \code{cve}. #' @param ... ignored. +#' +#' @examples +#' # create B for simulation +#' B <- rep(1, 5) / sqrt(5) +#' +#' set.seed(21) +#' #creat predictor data x ~ N(0, I_p) +#' x <- matrix(rnorm(500), 100) +#' +#' # simulate response variable +#' # y = f(B'x) + err +#' # with f(x1) = x1 and err ~ N(0, 0.25^2) +#' y <- x %*% B + 0.25 * rnorm(100) +#' +#' # calculate cve for unknown k between min.dim and max.dim. +#' cve.obj.simple <- cve(y ~ x) +#' +#' summary(cve.obj.simple) +#' #' @method summary cve #' @export summary.cve <- function(object, ...) { diff --git a/CVE_C/man/coef.cve.Rd b/CVE_C/man/coef.cve.Rd index 54a68c3..6ca80b0 100644 --- a/CVE_C/man/coef.cve.Rd +++ b/CVE_C/man/coef.cve.Rd @@ -21,9 +21,34 @@ dir the matrix of CS or CMS of given dimension Returns the SDR basis matrix for SDR dimension(s). } \examples{ -x <- matrix(rnorm(400),100,4) -y <- x[, 1] + x[, 2] + as.matrix(rnorm(100)) -dr <- cve(y ~ x, k = 2) # Only for sub-space dim. 
2 -B2 <- coef(dr, 2) +# set dimensions for simulation model +p <- 8 # sample dimension +k <- 2 # real dimension of SDR subspace +n <- 200 # samplesize +# create B for simulation +b1 <- rep(1 / sqrt(p), p) +b2 <- (-1)^seq(1, p) / sqrt(p) +B <- cbind(b1, b2) + +set.seed(21) +# creat predictor data x ~ N(0, I_p) +x <- matrix(rnorm(n * p), n, p) +# simulate response variable +# y = f(B'x) + err +# with f(x1, x2) = x1^2 + 2 * x2 and err ~ N(0, 0.25^2) +y <- (x \%*\% b1)^2 + 2 * (x \%*\% b2) + 0.25 * rnorm(100) +# calculate cve for k = 1, ..., 5 +cve.obj <- cve(y ~ x, max.dim = 5) +# get cve-estimate for B with dimensions (p, k = 2) +B2 <- coef(cve.obj, k = 2) + +# Projection matrix on span(B) +# equivalent to `B \%*\% t(B)` since B is semi-orthonormal +PB <- B \%*\% solve(t(B) \%*\% B) \%*\% t(B) +# Projection matrix on span(B2) +# equivalent to `B2 \%*\% t(B2)` since B2 is semi-orthonormal +PB2 <- B2 \%*\% solve(t(B2) \%*\% B2) \%*\% t(B2) +# compare estimation accuracy by Frobenius norm of difference of projections +norm(PB - PB2, type = 'F') } diff --git a/CVE_C/man/cve.Rd b/CVE_C/man/cve.Rd index 0ba8a05..4e9a159 100644 --- a/CVE_C/man/cve.Rd +++ b/CVE_C/man/cve.Rd @@ -20,6 +20,8 @@ supplied.} \item "weighted" variation with addaptive weighting of slices. }} +\item{max.dim}{upper bounds for \code{k}, (ignored if \code{k} is supplied).} + \item{...}{Parameters passed on to \code{cve.call}.} } \value{ @@ -58,19 +60,54 @@ a real \eqn{p \times k}{p x k} of rank \eqn{k <= p}{k \leq p}. Without loss of generality \eqn{B} is assumed to be orthonormal. } \examples{ -# create dataset -x <- matrix(rnorm(400), 100, 4) -y <- x[, 1] + x[, 2] + as.matrix(rnorm(100)) - -# Call CVE. -dr <- cve(y ~ x) -# Call weighted CVE. -dr.weighted <- cve(y ~ x, method = "weighted") - -# Training data responces of reduced data. -y.est <- directions(dr, 1) -# Extract SDR subspace basis of dimension 1. 
-B <- coef(dr.momentum, 1) +# set dimensions for simulation model +p <- 8 +k <- 2 +# create B for simulation +b1 <- rep(1 / sqrt(p), p) +b2 <- (-1)^seq(1, p) / sqrt(p) +B <- cbind(b1, b2) +# samplsize +n <- 200 +set.seed(21) +# creat predictor data x ~ N(0, I_p) +x <- matrix(rnorm(n * p), n, p) +# simulate response variable +# y = f(B'x) + err +# with f(x1, x2) = x1^2 + 2 * x2 and err ~ N(0, 0.25^2) +y <- (x \%*\% b1)^2 + 2 * (x \%*\% b2) + 0.25 * rnorm(100) +# calculate cve with method 'simple' for k unknown in 1, ..., 4 +cve.obj.s <- cve(y ~ x, max.dim = 4) # default method 'simple' +# calculate cve with method 'weighed' for k = 2 +cve.obj.w <- cve(y ~ x, k = 2, method = 'weighted') +# estimate dimension from cve.obj.s +khat <- predict_dim(cve.obj.s)$k +# get cve-estimate for B with dimensions (p, k = khat) +B2 <- coef(cve.obj.s, k = khat) +# get projected X data (same as cve.obj.s$X \%*\% B2) +proj.X <- directions(cve.obj.s, k = khat) +# plot y against projected data +plot(proj.X[, 1], y) +plot(proj.X[, 2], y) +# creat 10 new x points and y according to model +x.new <- matrix(rnorm(10 * p), 10, p) +y.new <- (x.new \%*\% b1)^2 + 2 * (x.new \%*\% b2) + 0.25 * rnorm(10) +# predict y.new +yhat <- predict(cve.obj.s, x.new, khat) +plot(y.new, yhat) +# projection matrix on span(B) +# same as B \%*\% t(B) since B is semi-orthogonal +PB <- B \%*\% solve(t(B) \%*\% B) \%*\% t(B) +# cve estimates for B with simple and weighted method +B.s <- coef(cve.obj.s, k = 2) +B.w <- coef(cve.obj.w, k = 2) +# same as B.s \%*\% t(B.s) since B.s is semi-orthogonal (same vor B.w) +PB.s <- B.s \%*\% solve(t(B.s) \%*\% B.s) \%*\% t(B.s) +PB.w <- B.w \%*\% solve(t(B.w) \%*\% B.w) \%*\% t(B.w) +# compare estimation accuracy of simple and weighted cve estimate by +# Frobenius norm of difference of projections. +norm(PB - PB.s, type = 'F') +norm(PB - PB.w, type = 'F') } \references{ diff --git a/CVE_C/man/cve.call.Rd b/CVE_C/man/cve.call.Rd index d5e5076..9b6f0d4 100644 --- a/CVE_C/man/cve.call.Rd +++ b/CVE_C/man/cve.call.Rd @@ -14,6 +14,13 @@ cve.call(X, Y, method = "simple", nObs = sqrt(nrow(X)), h = NULL, \item{Y}{Responses (1 dimensional).} +\item{method}{specifies the CVE method variation as one of +\itemize{ + \item "simple" exact implementation as described in the paper listed + below. + \item "weighted" variation with addaptive weighting of slices. +}} + \item{nObs}{parameter for choosing bandwidth \code{h} using \code{\link{estimate.bandwidth}} (ignored if \code{h} is supplied).} @@ -39,7 +46,8 @@ optimizing.} \item{gamma}{step-size reduction multiple.} -\item{V.init}{Semi-orthogonal matrix of dimensions `(ncol(X), ncol(X) - k)` #' as optimization starting value. (If supplied, \code{attempts} is +\item{V.init}{Semi-orthogonal matrix of dimensions `(ncol(X), ncol(X) - k) +as optimization starting value. (If supplied, \code{attempts} is set to 1 and \code{k} to match dimension)} \item{max.iter}{maximum number of optimization steps.} @@ -84,3 +92,27 @@ and \eqn{B = (b_1, ..., b_k)} is a real \eqn{p \times k}{p x k} of rank \eqn{k <= p}{k \leq p}. Without loss of generality \eqn{B} is assumed to be orthonormal. 
} +\examples{ +# create B for simulation (k = 1) +B <- rep(1, 5) / sqrt(5) + +set.seed(21) +# creat predictor data X ~ N(0, I_p) +X <- matrix(rnorm(500), 100, 5) +# simulate response variable +# Y = f(B'X) + err +# with f(x1) = x1 and err ~ N(0, 0.25^2) +Y <- X \%*\% B + 0.25 * rnorm(100) + +# calculate cve with method 'simple' for k = 1 +set.seed(21) +cve.obj.simple1 <- cve(Y ~ X, k = 1) + +# same as +set.seed(21) +cve.obj.simple2 <- cve.call(X, Y, k = 1) + +# extract estimated B's. +coef(cve.obj.simple1, k = 1) +coef(cve.obj.simple2, k = 1) +} diff --git a/CVE_C/man/directions.cve.Rd b/CVE_C/man/directions.cve.Rd index 3fe5bca..3abd2db 100644 --- a/CVE_C/man/directions.cve.Rd +++ b/CVE_C/man/directions.cve.Rd @@ -15,3 +15,22 @@ \description{ Computes projected training data \code{X} for given dimension `k`. } +\examples{ +# create B for simulation (k = 1) +B <- rep(1, 5) / sqrt(5) +set.seed(21) +# creat predictor data x ~ N(0, I_p) +x <- matrix(rnorm(500), 100, 5) +# simulate response variable +# y = f(B'x) + err +# with f(x1) = x1 and err ~ N(0, 0.25^2) +y <- x \%*\% B + 0.25 * rnorm(100) +# calculate cve with method 'simple' for k = 1 +set.seed(21) +cve.obj.simple <- cve(y ~ x, k = 1, method = 'simple') +# get projected data for k = 1 +x.proj <- directions(cve.obj.simple, k = 1) +# plot y against projected data +plot(x.proj, y) + +} diff --git a/CVE_C/man/estimate.bandwidth.Rd b/CVE_C/man/estimate.bandwidth.Rd index 3c02a0e..aa0b96e 100644 --- a/CVE_C/man/estimate.bandwidth.Rd +++ b/CVE_C/man/estimate.bandwidth.Rd @@ -25,3 +25,23 @@ with \eqn{n} the sample size, \eqn{p} its dimension (\code{n <- nrow(X); p <- ncol(X)}) and the covariance-matrix \eqn{\Sigma} which is \code{(n-1)/n} times the sample covariance estimate. } +\examples{ +# set dimensions for simulation model +p <- 5; k <- 1 +# create B for simulation +B <- rep(1, p) / sqrt(p) +# samplsize +n <- 100 +set.seed(21) +#creat predictor data x ~ N(0, I_p) +x <- matrix(rnorm(n * p), n, p) +# simulate response variable +# y = f(B'x) + err +# with f(x1) = x1 and err ~ N(0, 0.25^2) +y <- x \%*\% B + 0.25 * rnorm(100) +# calculate cve with method 'simple' for k = 1 +set.seed(21) +cve.obj.simple <- cve(y ~ x, k = k) +print(cve.obj.simple$res$'1'$h) +print(estimate.bandwidth(x, k = k)) +} diff --git a/CVE_C/man/plot.cve.Rd b/CVE_C/man/plot.cve.Rd index 4f795e1..8893782 100644 --- a/CVE_C/man/plot.cve.Rd +++ b/CVE_C/man/plot.cve.Rd @@ -14,6 +14,33 @@ } \description{ Boxplots of the loss from \code{min.dim} to \code{max.dim} \code{k} values. 
+} +\examples{ +# create B for simulation +B <- cbind(rep(1, 6), (-1)^seq(6)) / sqrt(6) + +set.seed(21) +# creat predictor data x ~ N(0, I_p) +X <- matrix(rnorm(600), 100) + +# simulate response variable +# y = f(B'x) + err +# with f(x1, x2) = x1^2 + 2 x2 and err ~ N(0, 0.25^2) +Y <- (X \%*\% B[, 1])^2 + 2 * X \%*\% B[, 2] + rnorm(100, 0, .1) + +# Create bandwidth estimation function +estimate.bandwidth <- function(X, k, nObs) { + n <- nrow(X) + p <- ncol(X) + X_c <- scale(X, center = TRUE, scale = FALSE) + 2 * qchisq((nObs - 1) / (n - 1), k) * sum(X_c^2) / (n * p) +} +# calculate cve with method 'simple' for k = min.dim,...,max.dim +cve.obj.simple <- cve(Y ~ X, h = estimate.bandwidth, nObs = sqrt(nrow(X))) + +# elbow plot +plot(cve.obj.simple) + } \seealso{ see \code{\link{par}} for graphical parameters to pass through diff --git a/CVE_C/man/predict.cve.Rd b/CVE_C/man/predict.cve.Rd index 4660cd4..7998f47 100644 --- a/CVE_C/man/predict.cve.Rd +++ b/CVE_C/man/predict.cve.Rd @@ -22,6 +22,33 @@ prediced response of data \code{newdata}. \description{ Predict responces using reduced data with \code{\link{mars}}. } +\examples{ +# create B for simulation +B <- rep(1, 5) / sqrt(5) + +set.seed(21) +# creat predictor data x ~ N(0, I_p) +x <- matrix(rnorm(500), 100) + +# simulate response variable +# y = f(B'x) + err +# with f(x1) = x1 and err ~ N(0, 0.25^2) +y <- x \%*\% B + 0.25 * rnorm(100) + +x.train <- x[1:80, ] +x.test <- x[81:100, ] +y.train <- y[1:80, ] +y.test <- y[81:100, ] + +# calculate cve with method 'simple' for k = 1 +cve.obj.simple <- cve(y.train ~ x.train, k = 1) + +# predict y.test from x.test +yhat <- predict(cve.obj.simple, x.test, 1) + +# plot prediction against y.test +plot(yhat, y.test) +} \seealso{ \code{\link{cve}}, \code{\link{cve.call}} or \pkg{\link{mars}}. } diff --git a/CVE_C/man/predict.dim.Rd b/CVE_C/man/predict.dim.Rd deleted file mode 100644 index 08f4490..0000000 --- a/CVE_C/man/predict.dim.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/predict_dim.R -\name{predict.dim} -\alias{predict.dim} -\title{Predicts SDR dimension using \code{\link[mda]{mars}} via a Cross-Validation.} -\usage{ -predict.dim(object, ...) -} -\arguments{ -\item{object}{instance of class \code{cve} (result of \code{cve}, -\code{cve.call}).} - -\item{...}{ignored.} -} -\value{ -list with -\itemize{ - \item MSE: Mean Square Error, - \item k: predicted dimensions. -} -} -\description{ -Predicts SDR dimension using \code{\link[mda]{mars}} via a Cross-Validation. -} diff --git a/CVE_C/man/summary.cve.Rd b/CVE_C/man/summary.cve.Rd index 028e08a..731bb66 100644 --- a/CVE_C/man/summary.cve.Rd +++ b/CVE_C/man/summary.cve.Rd @@ -14,3 +14,22 @@ \description{ Prints a summary of a \code{cve} result. } +\examples{ +# create B for simulation +B <- rep(1, 5) / sqrt(5) + +set.seed(21) +#creat predictor data x ~ N(0, I_p) +x <- matrix(rnorm(500), 100) + +# simulate response variable +# y = f(B'x) + err +# with f(x1) = x1 and err ~ N(0, 0.25^2) +y <- x \%*\% B + 0.25 * rnorm(100) + +# calculate cve for unknown k between min.dim and max.dim. +cve.obj.simple <- cve(y ~ x) + +summary(cve.obj.simple) + +} diff --git a/CVE_C/src/callLogger.c b/CVE_C/src/callLogger.c index f1eb13a..c51cef5 100644 --- a/CVE_C/src/callLogger.c +++ b/CVE_C/src/callLogger.c @@ -13,22 +13,18 @@ * * @param logger Pointer to a SEXP R object representing an R function. * @param env Pointer to a SEXP R object representing an R environment. 
+ * @param attempt counter of attempts. + * @param iter optimization iteration counter. * @param L Pointer to a SEXP R object representing an R environment. * @param V Pointer memory area of size `nrowV * ncolV` storing `V`. - * @param nrowV Nr. of rows of `V`. - * @param ncolV Nr. of columns of `V`. * @param G Pointer memory area of size `nrowG * ncolG` storing `G`. - * @param nrowG Nr. of rows of `G`. - * @param ncolG Nr. of columns of `G`. * @param loss Current loss L(V). * @param err Errof for break condition (0.0 befor first iteration). * @param tau Current step-size. */ void callLogger(SEXP logger, SEXP env, const int attempt, const int iter, - const double* L, const int lenL, - const double* V, const int nrowV, const int ncolV, - const double* G, const int nrowG, const int ncolG, + const mat* L, const mat* V, const mat* G, const double loss, const double err, const double tau) { /* Create R objects to be passed to R logger function. */ // Attempt is converted from 0-indexed to 1-indexed as R index. @@ -36,13 +32,13 @@ void callLogger(SEXP logger, SEXP env, SEXP r_iter = PROTECT(ScalarInteger(iter + 1)); /* Create R representations of L, V and G */ - SEXP r_L = PROTECT(allocVector(REALSXP, lenL)); - SEXP r_V = PROTECT(allocMatrix(REALSXP, nrowV, ncolV)); - SEXP r_G = PROTECT(allocMatrix(REALSXP, nrowG, ncolG)); + SEXP r_L = PROTECT(allocVector(REALSXP, L->nrow)); + SEXP r_V = PROTECT(allocMatrix(REALSXP, V->nrow, V->ncol)); + SEXP r_G = PROTECT(allocMatrix(REALSXP, G->nrow, G->ncol)); /* Copy data to R objects */ - memcpy(REAL(r_L), L, lenL * sizeof(double)); - memcpy(REAL(r_V), V, nrowV * ncolV * sizeof(double)); - memcpy(REAL(r_G), G, nrowG * ncolG * sizeof(double)); + memcpy(REAL(r_L), L->elem, L->nrow * sizeof(double)); + memcpy(REAL(r_V), V->elem, V->nrow * V->ncol * sizeof(double)); + memcpy(REAL(r_G), G->elem, G->nrow * G->ncol * sizeof(double)); /* Build data list passed to logger */ SEXP data = PROTECT(allocVector(VECSXP, 6)); diff --git a/CVE_C/src/cve.c b/CVE_C/src/cve.c index 0b25d00..d13d1a6 100644 --- a/CVE_C/src/cve.c +++ b/CVE_C/src/cve.c @@ -2,121 +2,113 @@ #include "cve.h" -// TODO: clarify -static inline double gaussKernel(const double x, const double scale) { - return exp(scale * x * x); -} - -void cve_sub(const int n, const int p, const int q, - const double *X, const double *Y, const double h, - const unsigned int method, - const double momentum, - const double tau_init, const double tol_init, - const double slack, const double gamma, - const int maxIter, const int attempts, - double *V, double *L, - SEXP logger, SEXP loggerEnv) { - - int attempt = 0, iter, i, nn = (n * (n - 1)) / 2; +void cve(const mat *X, const mat *Y, const double h, + const unsigned int method, + const double momentum, + const double tau_init, const double tol_init, + const double slack, const double gamma, + const int maxIter, const int attempts, + mat *V, mat *L, + SEXP logger, SEXP loggerEnv) { + + // TODO: param and dim. validation. + int n = X->nrow, p = X->ncol, q = V->ncol; + int attempt = 0, iter; double loss, loss_last, loss_best, err, tau; double tol = tol_init * sqrt((double)(2 * q)); - double gKscale = -0.5 / (h * h); double agility = -2.0 * (1.0 - momentum) / (h * h); double c = agility / (double)n; + // TODO: check parameters! dim, ... + /* Create further intermediate or internal variables. 
*/ - double *Q = (double*)R_alloc(p * p, sizeof(double)); - double *V_best = (double*)R_alloc(p * q, sizeof(double)); - double *L_best = (double*)R_alloc(n, sizeof(double)); - double *V_tau = (double*)R_alloc(p * q, sizeof(double)); - double *X_diff = (double*)R_alloc(nn * p, sizeof(double)); - double *X_proj = (double*)R_alloc(nn * p, sizeof(double)); - double *y1 = (double*)R_alloc(n, sizeof(double)); - double *vecD = (double*)R_alloc(nn, sizeof(double)); - double *vecK = (double*)R_alloc(nn, sizeof(double)); - double *vecS = (double*)R_alloc(nn, sizeof(double)); - double *colSums = (double*)R_alloc(n, sizeof(double)); - double *G = (double*)R_alloc(p * q, sizeof(double)); - double *A = (double*)R_alloc(p * p, sizeof(double)); + mat *lvecD_e = (void*)0; + mat *Ysquared = (void*)0; + mat *XV = (void*)0; + mat *lvecD = (void*)0; // TODO: combine. aka in-place lvecToSym + mat *D = (void*)0; // TODO: combine. aka in-place lvecToSym + mat *lvecK = (void*)0; // TODO: combine. aka in-place lvecToSym + mat *K = (void*)0; // TODO: combine. aka in-place lvecToSym + mat *colSumsK = (void*)0; + mat *W = (void*)0; + mat *y1 = (void*)0; + mat *y2 = (void*)0; + mat *S = (void*)0; + mat *tmp1 = (void*)0; + mat *tmp2 = (void*)0; + mat *G = (void*)0; + mat *A = (void*)0; + mat *V_tau = (void*)0; + mat *V_best = (void*)0; + mat *L_best = (void*)0; - double *V_init = (void*)0; - if (attempts < 1) { - V_init = (double*)R_alloc(p * q, sizeof(double)); - memcpy(V_init, V, p * q * sizeof(double)); + /* Allocate appropiate amount of working memory. */ + int workLen = 2 * (p + 1) * p; + if (workLen < n) { + workLen = n; } - - /* Determine size of working memory used by subroutines. */ - const int workLen = getWorkLen(n, p, q); double *workMem = (double*)R_alloc(workLen, sizeof(double)); - /* Compute X_diff, this is static for the entire algorithm. */ - rowDiffs(X, n, p, X_diff); + lvecD_e = rowDiffSquareSums(X, lvecD_e); + Ysquared = hadamard(1.0, Y, Y, 0.0, Ysquared); do { /* (Re)set learning rate. */ tau = tau_init; /* Check if start value for `V` was supplied. */ - if (V_init == (void*)0) { + if (attempts > 0) { /* Sample start value from stiefel manifold. */ - rStiefel(p, q, V, workMem, workLen); - } else { - /* (Re)Set start value of `V` to `V_init`. */ - memcpy(V, V_init, p * q * sizeof(double)); + V = rStiefel(p, q, V, workMem); } - /* Create projection matrix `Q <- I - V V^T` for initial `V`. */ - nullProj(V, p, q, Q); - - /* Compute Distance vector. */ - matrixprod(X, n, p, Q, p, p, X_proj); // here X_proj is only `(n, p)`. - rowDiffSquareSums(X_proj, n, p, vecD); - + /* Embed X_i's in V space */ + XV = matrixprod(1.0, X, V, 0.0, XV); + /* Compute embedded distances */ + lvecD = lincomb(1.0, lvecD_e, -1.0, rowDiffSquareSums(XV, lvecD)); /* Apply kernel to distances. */ - for (i = 0; i < nn; ++i) { - vecK[i] = gaussKernel(vecD[i], gKscale); + lvecK = applyKernel(lvecD, h, gauss, lvecK); + /* Transform lower vectors lvecD, lvecK into sym. matrices. 
*/ + D = lvecToSym(lvecD, 0.0, D); + K = lvecToSym(lvecK, 1.0, K); + /* Compute column sums of kernel matrix K */ + colSumsK = colSums(K, colSumsK); + /* Normalize K columns to obtain weight matrix W */ + W = colApply(K, '/', colSumsK, W); + /* first and second order weighted responces */ + y1 = matrixprod(1.0, W, Y, 0.0, y1); + y2 = matrixprod(1.0, W, Ysquared, 0.0, y2); + /* Compute losses */ + L = hadamard(-1.0, y1, y1, 1.0, copy(y2, L)); + /* Compute initial loss */ + if (method == simple) { + loss_last = mean(L); + /* Calculate the scaling matrix S */ + S = laplace(adjacence(L, Y, y1, D, W, gauss, S), workMem); + } else if (method == weighted) { + colSumsK = elemApply(colSumsK, '-', 1.0, colSumsK); + loss_last = dot(L, '/', colSumsK); + c = agility / sum(colSumsK); + /* Calculate the scaling matrix S */ + S = laplace(adjacence(L, Y, y1, D, K, gauss, S), workMem); + } else { + // TODO: error handling! } - - /* Compute col(row) sums of kernal vector (sym. packed lower tri - * matrix.), because `K == K^T` the rowSums are equal to colSums. */ - rowSumsSymVec(vecK, n, 1.0, colSums); - - /* Compute loss given the kernel vector and its column sums. - * Additionally the first momentum `y1` is computed and stored in - * the working memory (only intermidiate result, needed for `vecS`). */ - loss_last = cost(method, n, Y, vecK, colSums, y1, L); - - /* Calc the scaling vector used for final computation of grad. */ - scaling(method, n, Y, y1, L, vecD, vecK, colSums, vecS); - - /* Compute the eucledian gradient `G`. */ - rowSweep(X_diff, nn, p, "*", vecS, X_proj); - crossprod(X_diff, nn, p, X_proj, nn, p, workMem); - matrixprod(workMem, p, p, V, p, q, G); - if (method == CVE_METHOD_WEIGHTED) { - /* Compute summ of all kernel applied distances by summing the - * colSums of the kernel matrix. */ - // c = -(double)n; // to scale with sum(K) - n - // for (i = 0; i < n; ++i) { - // c += colSums[i]; - // } - // TODO: check for division by zero, but should not happen!!! - c = agility / (sum(colSums, n) - (double)n); - } - scale(c, G, p * q); // in-place + /* Gradient */ + tmp1 = matrixprod(1.0, S, X, 0.0, tmp1); + tmp2 = crossprod(1.0, X, tmp1, 0.0, tmp2); + G = matrixprod(c, tmp2, V, 0.0, G); if (logger) { callLogger(logger, loggerEnv, attempt, /* iter <- 0L */ -1, - L, n, - V, p, q, - G, p, q, + L, V, G, loss_last, /* err <- NA */ -1.0, tau); } /* Compute Skew-Symmetric matrix `A` used in Cayley transform. * `A <- tau * (G V^T - V G^T) + 0 * A`*/ - skew(p, q, tau, G, V, 0.0, A); + A = skew(tau, G, V, 0.0, A); for (iter = 0; iter < maxIter; ++iter) { /* Before Starting next iteration check if the Uer has requested an @@ -126,50 +118,57 @@ void cve_sub(const int n, const int p, const int q, R_CheckUserInterrupt(); /* Move `V` along the gradient direction. */ - cayleyTransform(p, q, A, V, V_tau, workMem); + V_tau = cayleyTransform(A, V, V_tau, workMem); - /* Create projection matrix for `V_tau`. */ - nullProj(V_tau, p, q, Q); - - /* Compute Distance vector. */ - matrixprod(X, n, p, Q, p, p, X_proj); // here X_proj only `(n, p)`. - rowDiffSquareSums(X_proj, n, p, vecD); + // Rprintf("Start attempt(%2d), iter (%2d): err: %f, loss: %f, tau: %f\n", + // attempt, iter, dist(V, V_tau), loss_last, tau); + /* Embed X_i's in V space */ + XV = matrixprod(1.0, X, V_tau, 0.0, XV); + /* Compute embedded distances */ + lvecD = lincomb(1.0, lvecD_e, -1.0, rowDiffSquareSums(XV, lvecD)); /* Apply kernel to distances. 
*/ - for (i = 0; i < nn; ++i) { - vecK[i] = gaussKernel(vecD[i], gKscale); + lvecK = applyKernel(lvecD, h, gauss, lvecK); + /* Transform lower vectors lvecD, lvecK into sym. matrices. */ + D = lvecToSym(lvecD, 0.0, D); + K = lvecToSym(lvecK, 1.0, K); + /* Compute column sums of kernel matrix K */ + colSumsK = colSums(K, colSumsK); + /* Normalize K columns to obtain weight matrix W */ + W = colApply(K, '/', colSumsK, W); + /* first and second order weighted responces */ + y1 = matrixprod(1.0, W, Y, 0.0, y1); + y2 = matrixprod(1.0, W, Ysquared, 0.0, y2); + /* Compute losses */ + L = hadamard(-1.0, y1, y1, 1.0, copy(y2, L)); + /* Compute loss */ + if (method == simple) { + loss = mean(L); + } else if (method == weighted) { + colSumsK = elemApply(colSumsK, '-', 1.0, colSumsK); + loss = dot(L, '/', colSumsK); + } else { + // TODO: error handling! } - /* Compute col(row) sums of kernal vector (sym. packed lower tri - * matrix.), because `K == K^T` the rowSums are equal to colSums. */ - rowSumsSymVec(vecK, n, 1.0, colSums); - - /* Compute loss given the kernel vector and its column sums. - * Additionally the first momentum `y1` is computed and stored in - * the working memory (only intermidiate result, needed for vecS).*/ - loss = cost(method, n, Y, vecK, colSums, y1, L); - /* Check if step is appropriate, iff not reduce learning rate. */ if ((loss - loss_last) > loss_last * slack) { tau *= gamma; - scale(gamma, A, p * p); + A = elemApply(A, '*', gamma, A); // scale A by gamma continue; } - /* Compute error, use workMem (keep first `n`, they store `y1`). */ - skew(p, q, 1.0, V, V_tau, 0.0, workMem); - err = norm(workMem, p, p, "F"); + /* Compute error, use workMem. */ + err = dist(V, V_tau); /* Shift next step to current step and store loss to last. */ - memcpy(V, V_tau, p * q * sizeof(double)); + V = copy(V_tau, V); loss_last = loss; if (logger) { callLogger(logger, loggerEnv, attempt, iter, - L, n, - V, p, q, - G, p, q, + L, V, G, loss, err, tau); } @@ -178,41 +177,35 @@ void cve_sub(const int n, const int p, const int q, break; } - /* Continue computing the gradient. */ - /* Calc the scaling vector used for final computation of grad. */ - scaling(method, n, Y, y1, L, vecD, vecK, colSums, vecS); - - /* Compute the eucledian gradient `G`. */ - rowSweep(X_diff, nn, p, "*", vecS, X_proj); - crossprod(X_diff, nn, p, X_proj, nn, p, workMem); - // /* Update without momentum */ - // matrixprod(workMem, p, p, V, p, q, G); - // scale(-2. / (((double)n) * h * h), G, p * q); // in-place - /* G <- momentum * G + agility * workMem V */ - - if (method == CVE_METHOD_WEIGHTED) { - /* Compute summ of all kernel applied distances by summing the - * colSums of the kernel matrix. */ - // TODO: check for division by zero, but should not happen!!! - c = agility / (sum(colSums, n) - (double)n); + if (method == simple) { + /* Calculate the scaling matrix S */ + S = laplace(adjacence(L, Y, y1, D, W, gauss, S), workMem); + } else if (method == weighted) { + /* Calculate the scaling matrix S */ + S = laplace(adjacence(L, Y, y1, D, K, gauss, S), workMem); + c = agility / sum(colSumsK); + } else { + // TODO: error handling! } - F77_NAME(dgemm)("N", "N", &p, &q, &p, - &c, workMem, &p, V, &p, - &momentum, G, &p); + + /* Gradient */ + tmp1 = matrixprod(1.0, S, X, 0.0, tmp1); + tmp2 = crossprod(1.0, X, tmp1, 0.0, tmp2); + G = matrixprod(c, tmp2, V, momentum, G); /* Compute Skew-Symmetric matrix `A` used in Cayley transform. 
* `A <- tau * (G V^T - V G^T) + 0 * A`*/ - skew(p, q, tau, G, V, 0.0, A); + A = skew(tau, G, V, 0.0, A); } /* Check if current attempt improved previous ones */ if (attempt == 0 || loss < loss_best) { loss_best = loss; - memcpy(V_best, V, p * q * sizeof(double)); - memcpy(L_best, L, n * sizeof(double)); + V_best = copy(V, V_best); + L_best = copy(L, L_best); } } while (++attempt < attempts); - memcpy(V, V_best, p * q * sizeof(double)); - memcpy(L, L_best, n * sizeof(double)); + V = copy(V_best, V); + L = copy(L_best, L); } diff --git a/CVE_C/src/cve.h b/CVE_C/src/cve.h index 6216ac4..4c60f2b 100644 --- a/CVE_C/src/cve.h +++ b/CVE_C/src/cve.h @@ -12,107 +12,90 @@ #define CVE_MEM_CHUNK_SIZE 2032 #define CVE_MEM_CHUNK_SMALL 1016 +#define BLOCK_SIZE 8 -/* Bis masks for method types */ -#define CVE_METHOD_WEIGHTED 1 +/** + * @struct Matrix of dimensions `nrow x ncol`. + */ +typedef struct matrix { + int nrow; /**< Number of rows */ + int ncol; /**< Number of columns */ + void *origin; /**< Reference to origin, see `asMat()`. */ + double *elem; /**< Column-major array of matrix elements. */ +} mat; -// typedef struct { -// unsigned int nrow; -// unsigned int ncol; -// unsigned int memsize; -// double *data; -// } mat; +typedef enum { + simple, + weighted +} method; -// mat* Matrix(const unsigned int nrow, const unsigned int ncol); +typedef enum { + gauss +} kernel; -void cve_sub(const int n, const int p, const int q, - const double *X, const double *Y, const double h, - const unsigned int method, - const double momentum, - const double tau_init, const double tol_init, - const double slack, const double gamma, - const int maxIter, int attempts, - double *V, double *L, - SEXP logger, SEXP loggerEnv); + +void cve(const mat *X, const mat *Y, const double h, + const unsigned int method, + const double momentum, + const double tau_init, const double tol_init, + const double slack, const double gamma, + const int maxIter, const int attempts, + mat *V, mat *L, + SEXP logger, SEXP loggerEnv); void callLogger(SEXP logger, SEXP env, - const int attempt, const int epoch, - const double* L, const int lenL, - const double* V, const int nrowV, const int ncolV, - const double* G, const int nrowG, const int ncolG, + const int attempt, const int iter, + const mat* L, const mat* V, const mat* G, const double loss, const double err, const double tau); -/* CVE sub-routines */ -int getWorkLen(const int n, const int p, const int q); -double cost(const unsigned int method, - const int n, - const double *Y, - const double *vecK, - const double *colSums, - double *y1, double *L); -void scaling(const unsigned int method, - const int n, - const double *Y, const double *y1, const double *L, - const double *vecD, const double *vecK, - const double *colSums, - double *vecS); +/******************************************************************************/ +/** rStiefel.c **/ +/******************************************************************************/ +/* Random element from Stiefel manifold. 
*/ +mat* rStiefel(const int p, const int q, mat *V, double *workMem); -/* rStiefel */ -void rStiefel(const int p, const int q, double *V, - double *workMem, int workLen); +/******************************************************************************/ +/** matrix.c **/ +/******************************************************************************/ +/* Create and Copy matrices */ +mat* matrix(const int nrow, const int ncol); +mat* zero(const int nrow, const int ncol); +mat* copy(mat *src, mat *dest); +/* Matrix to scalar */ +double sum(const mat *A); +double mean(const mat *A); +double squareSum(const mat* A); +double dot(const mat *A, const char op, const mat *B); +double dist(const mat *A, const mat *B); +/* Matrix to vector (`ncol == 1` matrices, aka row vectors) */ +mat* rowSums(const mat *A, mat *sums); +mat* colSums(const mat *A, mat *sums); +mat* rowDiffSquareSums(const mat *X, mat *lvecD); +/* Matrix and scalar to Matrix */ +mat* elemApply(mat *A, const char op, const double scalar, mat *B); +/* Matrix and vector to Matrix */ +mat* colApply(mat *A, const char op, mat *B, mat *C); +/* Matrix and Matrix to Matrix */ +mat* lincomb(const double alpha, const mat *A, const double beta, mat *B); +mat* matrixprod(const double alpha, const mat *A, const mat *B, + double beta, mat* C); +mat* crossprod(const double alpha, const mat *A, const mat *B, + double beta, mat* C); +mat* hadamard(const double alpha, const mat* A, const mat *B, + double beta, mat* C); +mat* skew(const double alpha, mat* A, mat *B, double beta, mat* C); +/* Matrix Transformations */ +mat* cayleyTransform(mat *A, mat *B, mat *C, double *workMem); +mat* laplace(mat *A, double *workMem); +mat* lvecToSym(const mat* A, const double diag, mat* B); -/* MATRIX */ -double sum(const double *A, const int nelem); - -double norm(const double *A, const int nrow, const int ncol, - const char *type); - -void matrixprod(const double *A, const int nrowA, const int ncolA, - const double *B, const int nrowB, const int ncolB, - double *C); - -void crossprod(const double *A, const int nrowA, const int ncolA, - const double *B, const int nrowB, const int ncolB, - double *C); - -void skew(const int nrow, const int ncol, - double alpha, const double *A, const double *B, - double beta, - double *C); - -void nullProj(const double *B, const int nrowB, const int ncolB, - double *Q); - -void scale(const double s, double *A, const int nelem); - -void cayleyTransform(const int p, const int q, - const double *A, const double *B, - double *X, double *workMem); - -/* Row and column opperations. 
*/ -void rowSums(const double *A, const int nrow, const int ncol, - double *sum); - -void colSums(const double *A, const int nrow, const int ncol, - double *sum); - -void rowSquareSums(const double *A, const int nrow, const int ncol, - double *sum); - -void rowSumsSymVec(const double *Avec, const int nrow, - const double diag, - double *sum); - -void rowDiffs(const double* X, const int nrow, const int ncol, - double *diffs); - -void rowDiffSquareSums(const double* X, const int nrow, const int ncol, - double *sum); - -/* SWEEP */ -void rowSweep(const double *A, const int nrow, const int ncol, - const char* op, - const double *v, // vector of length nrow - double *C); +/******************************************************************************/ +/** cve_subroutines.c **/ +/******************************************************************************/ +/* CVE specific sub-routines */ +mat* adjacence(const mat *vec_L, const mat *vec_Y, const mat *vec_y1, + const mat *mat_D, const mat *mat_W, kernel kernel, + mat *mat_S); +mat* applyKernel(const mat* A, const double h, kernel kernel, mat* B); #endif /* CVE_INCLUDE_GUARD_H_ */ diff --git a/CVE_C/src/cve_subroutines.c b/CVE_C/src/cve_subroutines.c index ad3faf7..50ef975 100644 --- a/CVE_C/src/cve_subroutines.c +++ b/CVE_C/src/cve_subroutines.c @@ -1,95 +1,257 @@ #include "cve.h" -int getWorkLen(const int n, const int p, const int q) { - int mpq; /**< Max of p and q */ - int nn = ((n - 1) * n) / 2; +/** + * Applies the requested kernel element-wise. + * + * @param A matrix to apply the kernel to. + * @param h bandwidth parameter. + * @param kernel the kernel to be used. + * @param B (in/out) matrix `A` with element-wise applied kernel. + * + * @returns ether the passed `B`, or a new created matrix if `B` was NULL. + */ +mat* applyKernel(const mat* A, const double h, kernel kernel, mat* B) { + int i, nn = A->nrow * A->ncol; + int nn4 = 4 * (nn / 4); + double scale; + const double * restrict a; + double * restrict b; - if (p > q) { - mpq = p; - } else { - mpq = q; + if (!B) { + B = matrix(A->nrow, A->ncol); + } else if (nn != B->nrow * B->ncol) { + // TODO: error handling! } - if (nn * p < (mpq + 1) * mpq) { - return 2 * (mpq + 1) * mpq; - } else { - return (nn + mpq) * mpq; + + a = A->elem; + b = B->elem; + switch (kernel) { + case gauss: + scale = -0.5 / (h * h); + for (i = 0; i < nn4; i += 4) { + b[i] = exp(scale * a[i] * a[i]); + b[i + 1] = exp(scale * a[i + 1] * a[i + 1]); + b[i + 2] = exp(scale * a[i + 2] * a[i + 2]); + b[i + 3] = exp(scale * a[i + 3] * a[i + 3]); + } + for (; i < nn; ++i) { + b[i] = exp(scale * a[i] * a[i]); + } + break; + default: + // TODO: error handling! + break; } + + return B; } -double cost(const unsigned int method, - const int n, - const double *Y, - const double *vecK, - const double *colSums, - double *y1, double *L) { - int i, j, k; - double tmp, sum; +/** + * Scaling matrix `S` defined as + * s_{i j} = (L_i - (Y_j - y1_i)^2) * d_{i j} * w_{i j} + * + * Mapping of vectors `L`, `y1` and `Y` combinations. + * + * Y[j] + * ------ j -----> + * s[0] s[n] . . . s[jn] . . . s[(j-1)n] + * s[1] s[n+1] . . . s[jn+1] . . . s[(j-1)n+1] + * | . . . . . . + * | . . . . . . + * . . . . . . + * L[i], y1[i] i s[i] s[n+i] . . . s[jn+i] . . . s[(j-1)n+i] + * . . . . . . + * | . . . . . . + * v . . . . . . + * s[n-1] s[2n-1] . . . s[n-1] . . . s[nn-1] + * + * @param L per sample loss vector of (lenght `n`). + * @param Y responces (lenght `n`). + * @param y1 weighted responces (lenght `n`). + * @param D distance matrix (dim. 
`n x n`). + * @param W weight matrix (dim. `n x n`). + * @param kernel the kernel to be used. + * + * @returns passed matrix `S` if not NULL, then a new `n x n` matrix is created. + * + * @example Basically equivalent to the following R function. + * r_LS <- function(L, Y, y1, D, W) { + * # get dimension + * n <- length(L) + * # Indices + * i <- rep(1:n, n) + * j <- rep(1:n, each = n) + * # Compute S + * matrix(L[i] - (Y[j] - y1[i])^2, n) * W * D + * } + * + * @details mapping for indices used in blocked implementation. + * + * n ..... Dimensions of `S`, a `n x n` matrix. + * B ..... BLOCK_SIZE + * rB .... block reminder, aka rB = B % n. + * + * Y[j] + * 0 ----- j -----> n + * 0 +--------------------+ + * | ' | + * ' | k B x n | + * ' | v | + * ' +--------------------+ + * L[i], y1[i] i | ' | + * ' | k B x n | + * ' | v | + * v +--------------------+ + * | k rB x n | + * n +--------------------+ + */ +// TODO: fix: cache misses in Y?! +mat* adjacence(const mat *vec_L, const mat *vec_Y, const mat *vec_y1, + const mat *mat_D, const mat *mat_W, kernel kernel, + mat *mat_S) { + int i, j, k, n = vec_L->nrow; + int block_size, block_batch_size; + int max_size = 64 < n ? 64 : n; // Block Size set to 64 - for (i = 0; i < n; ++i) { - y1[i] = Y[i]; - L[i] = Y[i] * Y[i]; + double Y_j, tmp0, tmp1, tmp2, tmp3; + double *Y = vec_Y->elem; + double *L = vec_L->elem; + double *y1 = vec_y1->elem; + double *D, *W, *S; + + // TODO: Check dims. + + if (!mat_S) { + mat_S = matrix(n, n); } - for (k = j = 0; j < n; ++j) { - for (i = j + 1; i < n; ++i, ++k) { - y1[i] += Y[j] * vecK[k]; - y1[j] += Y[i] * vecK[k]; - L[i] += Y[j] * Y[j] * vecK[k]; - L[j] += Y[i] * Y[i] * vecK[k]; + for (i = 0; i < n; i += block_size) { + /* Set blocks (left upper corner) */ + S = mat_S->elem + i; + D = mat_D->elem + i; + W = mat_W->elem + i; + /* determine block size */ + block_size = n - i; + if (block_size > max_size) { + block_size = max_size; } - } - - for (i = 0; i < n; ++i) { - y1[i] /= colSums[i]; - L[i] /= colSums[i]; - } - - tmp = 0.0; - if (method == CVE_METHOD_WEIGHTED) { - sum = 0.0; - for (i = 0; i < n; ++i) { - tmp += (colSums[i] - 1.0) * (L[i] -= y1[i] * y1[i]); - sum += colSums[i]; - } - return tmp / (sum - (double)n); // TODO: check for division by zero! 
- } else { - for (i = 0; i < n; ++i) { - tmp += (L[i] -= y1[i] * y1[i]); - } - return tmp / (double)n; - } -} - -void scaling(const unsigned int method, - const int n, - const double *Y, const double *y1, const double *L, - const double *vecD, const double *vecK, - const double *colSums, - double *vecS) { - int i, j, k, nn = (n * (n - 1)) / 2; - double tmp; - - if (method == CVE_METHOD_WEIGHTED) { - for (k = j = 0; j < n; ++j) { - for (i = j + 1; i < n; ++i, ++k) { - tmp = Y[j] - y1[i]; - vecS[k] = (L[i] - (tmp * tmp)); - tmp = Y[i] - y1[j]; - vecS[k] += (L[j] - (tmp * tmp)); - } - } - } else { - for (k = j = 0; j < n; ++j) { - for (i = j + 1; i < n; ++i, ++k) { - tmp = Y[j] - y1[i]; - vecS[k] = (L[i] - (tmp * tmp)) / colSums[i]; - tmp = Y[i] - y1[j]; - vecS[k] += (L[j] - (tmp * tmp)) / colSums[j]; + block_batch_size = 4 * (block_size / 4); + /* for each column in the block */ + for (j = 0; j < n; ++j, S += n, D += n, W += n) { + Y_j = Y[j]; + /* iterate over block rows */ + for (k = 0; k < block_batch_size; k += 4) { + tmp0 = Y_j - y1[k]; + tmp1 = Y_j - y1[k + 1]; + tmp2 = Y_j - y1[k + 2]; + tmp3 = Y_j - y1[k + 3]; + S[k] = (L[k] - (tmp0 * tmp0)) * D[k] * W[k]; + S[k + 1] = (L[k + 1] - (tmp1 * tmp1)) * D[k + 1] * W[k + 1]; + S[k + 2] = (L[k + 2] - (tmp2 * tmp2)) * D[k + 2] * W[k + 2]; + S[k + 3] = (L[k + 3] - (tmp3 * tmp3)) * D[k + 3] * W[k + 3]; + } + for (; k < block_size; ++k) { + tmp0 = Y_j - y1[k]; + S[k] = (L[k] - (tmp0 * tmp0)) * D[k] * W[k]; } } + L += block_size; + y1 += block_size; } - for (k = 0; k < nn; ++k) { - vecS[k] *= vecK[k] * vecD[k]; - } + return mat_S; } + +// int getWorkLen(const int n, const int p, const int q) { +// int mpq; /**< Max of p and q */ +// int nn = ((n - 1) * n) / 2; + +// if (p > q) { +// mpq = p; +// } else { +// mpq = q; +// } +// if (nn * p < (mpq + 1) * mpq) { +// return 2 * (mpq + 1) * mpq; +// } else { +// return (nn + mpq) * mpq; +// } +// } + +// double cost(const unsigned int method, +// const int n, +// const double *Y, +// const double *vecK, +// const double *colSums, +// double *y1, double *L) { +// int i, j, k; +// double tmp, sum; + +// for (i = 0; i < n; ++i) { +// y1[i] = Y[i]; +// L[i] = Y[i] * Y[i]; +// } + +// for (k = j = 0; j < n; ++j) { +// for (i = j + 1; i < n; ++i, ++k) { +// y1[i] += Y[j] * vecK[k]; +// y1[j] += Y[i] * vecK[k]; +// L[i] += Y[j] * Y[j] * vecK[k]; +// L[j] += Y[i] * Y[i] * vecK[k]; +// } +// } + +// for (i = 0; i < n; ++i) { +// y1[i] /= colSums[i]; +// L[i] /= colSums[i]; +// } + +// tmp = 0.0; +// if (method == CVE_METHOD_WEIGHTED) { +// sum = 0.0; +// for (i = 0; i < n; ++i) { +// tmp += (colSums[i] - 1.0) * (L[i] -= y1[i] * y1[i]); +// sum += colSums[i]; +// } +// return tmp / (sum - (double)n); // TODO: check for division by zero! 
+// } else { +// for (i = 0; i < n; ++i) { +// tmp += (L[i] -= y1[i] * y1[i]); +// } +// return tmp / (double)n; +// } +// } + +// void scaling(const unsigned int method, +// const int n, +// const double *Y, const double *y1, const double *L, +// const double *vecD, const double *vecK, +// const double *colSums, +// double *vecS) { +// int i, j, k, nn = (n * (n - 1)) / 2; +// double tmp; + +// if (method == CVE_METHOD_WEIGHTED) { +// for (k = j = 0; j < n; ++j) { +// for (i = j + 1; i < n; ++i, ++k) { +// tmp = Y[j] - y1[i]; +// vecS[k] = (L[i] - (tmp * tmp)); +// tmp = Y[i] - y1[j]; +// vecS[k] += (L[j] - (tmp * tmp)); +// } +// } +// } else { +// for (k = j = 0; j < n; ++j) { +// for (i = j + 1; i < n; ++i, ++k) { +// tmp = Y[j] - y1[i]; +// vecS[k] = (L[i] - (tmp * tmp)) / colSums[i]; +// tmp = Y[i] - y1[j]; +// vecS[k] += (L[j] - (tmp * tmp)) / colSums[j]; +// } +// } +// } + +// for (k = 0; k < nn; ++k) { +// vecS[k] *= vecK[k] * vecD[k]; +// } +// } diff --git a/CVE_C/src/export.c b/CVE_C/src/export.c index f7b9d16..73bef89 100644 --- a/CVE_C/src/export.c +++ b/CVE_C/src/export.c @@ -1,27 +1,37 @@ #include "cve.h" -// SEXP rStiefel_c(SEXP pin, SEXP qin) { -// int p = asInteger(pin); -// int q = asInteger(qin); +/** + * Converts a `SEXP` (S EXPression) into a matrix struct `mat`. + * + * @param S source struct to be converted. + * + * @details Reuses the memory area of the SEXP object, therefore manipulation + * of the returned matrix works in place of the SEXP object. In addition, + * a reference to the original SEXP is stored and will be retriefed from + * `asSEXP()` if the matrix was created through this function. + */ +static mat* asMat(SEXP S) { + // TODO: error checking and conversion + mat* M = (mat*)R_alloc(1, sizeof(mat)); + if (isMatrix(S)) { + M->nrow = (int)nrows(S); + M->ncol = (int)ncols(S); + } else { + M->nrow = (int)length(S); + M->ncol = 1; + } + M->origin = S; + M->elem = REAL(S); + return M; +} -// SEXP Vout = PROTECT(allocMatrix(REALSXP, p, q)); - -// int workLen = 2 * (p + 1) * q; -// double *workMem = (double*)R_alloc(workLen, sizeof(double)); - -// rStiefel(p, q, REAL(Vout), workMem, workLen); - -// UNPROTECT(1); -// return Vout; -// } - -SEXP cve(SEXP X, SEXP Y, SEXP k, SEXP h, - SEXP method, - SEXP V, // initial - SEXP momentum, SEXP tau, SEXP tol, - SEXP slack, SEXP gamma, - SEXP maxIter, SEXP attempts, - SEXP logger, SEXP loggerEnv) { +SEXP cve_export(SEXP X, SEXP Y, SEXP k, SEXP h, + SEXP method, + SEXP V, // initial + SEXP momentum, SEXP tau, SEXP tol, + SEXP slack, SEXP gamma, + SEXP maxIter, SEXP attempts, + SEXP logger, SEXP loggerEnv) { /* Handle logger parameter, set to NULL pointer if not a function. */ if (!(isFunction(logger) && isEnvironment(loggerEnv))) { logger = (void*)0; @@ -42,20 +52,19 @@ SEXP cve(SEXP X, SEXP Y, SEXP k, SEXP h, /* Check `attempts`, if not positive use passed values of `V` as * optimization start value without further attempts. * Therefor, copy from `V` to `Vout`. */ - if (asInteger(attempts) < 1L) { + if (asInteger(attempts) < 1) { // TODO: Check for memcpy(REAL(Vout), REAL(V), p * q * sizeof(double)); } - /* Call CVE simple subroutine. 
*/ - cve_sub(n, p, q, - REAL(X), REAL(Y), asReal(h), - asInteger(method), - asReal(momentum), asReal(tau), asReal(tol), - asReal(slack), asReal(gamma), - asInteger(maxIter), asInteger(attempts), - REAL(Vout), REAL(Lout), - logger, loggerEnv); + /* Call CVE */ + cve(asMat(X), asMat(Y), asReal(h), + asInteger(method), + asReal(momentum), asReal(tau), asReal(tol), + asReal(slack), asReal(gamma), + asInteger(maxIter), asInteger(attempts), + asMat(Vout), asMat(Lout), + logger, loggerEnv); /* Build output list object with names "V", "L" */ SEXP out = PROTECT(allocVector(VECSXP, 2)); diff --git a/CVE_C/src/init.c b/CVE_C/src/init.c index 767231b..8f480c4 100644 --- a/CVE_C/src/init.c +++ b/CVE_C/src/init.c @@ -4,16 +4,16 @@ #include /* .Call calls */ -extern SEXP cve(SEXP X, SEXP Y, SEXP k, SEXP h, - SEXP method, - SEXP V, // initial - SEXP momentum, SEXP tau, SEXP tol, - SEXP slack, SEXP gamma, - SEXP maxIter, SEXP attempts, - SEXP logger, SEXP loggerEnv); +extern SEXP cve_export(SEXP X, SEXP Y, SEXP k, SEXP h, + SEXP method, + SEXP V, // initial + SEXP momentum, SEXP tau, SEXP tol, + SEXP slack, SEXP gamma, + SEXP maxIter, SEXP attempts, + SEXP logger, SEXP loggerEnv); static const R_CallMethodDef CallEntries[] = { - {"cve", (DL_FUNC) &cve, 15}, + {"cve_export", (DL_FUNC) &cve_export, 15}, {NULL, NULL, 0} }; diff --git a/CVE_C/src/matrix.c b/CVE_C/src/matrix.c index 2e3d261..bc50f0b 100644 --- a/CVE_C/src/matrix.c +++ b/CVE_C/src/matrix.c @@ -1,191 +1,978 @@ #include "cve.h" -// mat* Matrix(const unsigned int nrow, const unsigned int ncol) { -// mat* newMat = (mat*)R_alloc(1, sizeof(mat)); -// newMat->nrow = nrow; -// newMat->ncol = ncol; -// newMat->memsize = nrow * ncol; -// newMat->data = (double*)R_alloc(nrow * ncol, sizeof(double)); - -// return newMat; -// } - -double sum(const double *A, const int nelem) { - int i, nelemb = (nelem / 4) * 4; - double sum = 0.0; - - for (i = 0; i < nelemb; i += 4) { - sum += A[i] - + A[i + 1] - + A[i + 2] - + A[i + 3]; - } - for (; i < nelem; ++i) { - sum += A[i]; - } - - return sum; +/** + * Creates a matrix. + * + * @param nrow number of rows. + * @param ncol number of columns. + * + * @returns an uninitialized `nrow x ncol` matrix. + * + * @details matrix elements are NOT initialized, its content is "random". + */ +mat* matrix(const int nrow, const int ncol) { + mat* M = (mat*)R_alloc(1, sizeof(mat)); + M->nrow = nrow; + M->ncol = ncol; + M->origin = (void*)0; + M->elem = (double*)R_alloc(nrow * ncol, sizeof(double)); + return M; } -double norm(const double *A, const int nrow, const int ncol, - const char *type) { - int i, nelem = nrow * ncol; - int nelemb = (nelem / 4) * 4; - double sum = 0.0; +/** + * Creates a zero matrix. + * + * @param nrow number of rows. + * @param ncol number of columns. + * + * @returns a `nrow x ncol` matrix with all elements equal 0. + */ +mat* zero(const int nrow, const int ncol) { + mat* M = matrix(nrow, ncol); + /* Set all emements zero */ + memset(M->elem, 0, nrow * ncol * sizeof(double)); + return M; +} - if (*type == 'F') { - for (i = 0; i < nelemb; i += 4) { - sum += A[i] * A[i] - + A[i + 1] * A[i + 1] - + A[i + 2] * A[i + 2] - + A[i + 3] * A[i + 3]; +/** + * Copies elements form `src` to `dest` matrix. + * + * @param dest (in/out) destination matrix. + * @param src source matrix. + * + * @returns passed dest matrix. + */ +mat* copy(mat *src, mat *dest) { + if (!dest) { + dest = matrix(src->nrow, src->ncol); + } else if (src->nrow != dest->nrow || src->ncol != dest->ncol) { + // TODO: error handling! 
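+        // A minimal sketch of the intended report, following the
+        // `error()` call already used in `cayleyTransform` below
+        // (message text is illustrative only):
+        //     error("[ copy ] dimension mismatch: (%d, %d) vs. (%d, %d)",
+        //           src->nrow, src->ncol, dest->nrow, dest->ncol);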
+ } + memcpy(dest->elem, src->elem, dest->nrow * dest->ncol * sizeof(double)); + + return dest; +} + +/** + * Sum of all elements. + * + * @param A matrix. + * + * @returns the sum of elements of `A`. + */ +double sum(const mat* A) { + int i, nn = A->nrow * A->ncol; + int nn4 = 4 * (nn / 4); + double *a = A->elem; + double s = 0.0; + + for (i = 0; i < nn4; i += 4) { + s += a[i] + a[i + 1] + a[i + 2] + a[i + 3]; + } + for (; i < nn; ++i) { + s += a[i]; + } + + return s; +} + +/** + * Mean of all elements. + * + * @param A matrix. + * + * @returns the mean of elements of `A`. + */ +double mean(const mat* A) { + int i, nn = A->nrow * A->ncol; + int nn4 = 4 * (nn / 4); + double *a = A->elem; + double s = 0.0; + + for (i = 0; i < nn4; i += 4) { + s += a[i] + a[i + 1] + a[i + 2] + a[i + 3]; + } + for (; i < nn; ++i) { + s += a[i]; + } + + return s / (double)nn; +} + +#define CVE_DOT_ALG(op) \ + for (i = 0; i < nn4; i += 4) { \ + s += ((a[i]) op (b[i])) \ + + ((a[i + 1]) op (b[i + 1])) \ + + ((a[i + 2]) op (b[i + 2])) \ + + ((a[i + 3]) op (b[i + 3])); \ + } \ + for (; i < nn; ++i) { \ + s += (a[i]) op (b[i]); \ + } + +/** + * Element-wise sum of `A` and `op(B)`. + * + * sum(A + B), for op = '+', + * sum(A * B), for op = '*', + * sum(A - B), for op = '-', + * sum(A / B), for op = '/'. + * + * @param A matrix. + * @param op one of '+', '-', '*' or '/'. + * @param B matrix of the same size as `A`. + * + * @returns sum of element-wise products. + */ +double dot(const mat *A, const char op, const mat *B) { + int i, nn = A->nrow * A->ncol; + int nn4 = 4 * (nn / 4); + double *a = A->elem; + double *b = B->elem; + double s = 0.0; + + if (B->nrow * B->ncol != nn) { + // TODO: error handling! + } + + switch (op) { + case '+': + CVE_DOT_ALG(+) + break; + case '-': + CVE_DOT_ALG(-) + break; + case '*': + CVE_DOT_ALG(*) + break; + case '/': + CVE_DOT_ALG(/) + break; + default: + // TODO: error handling! + break; + } + + return s; +} + +/** + * Computes the sub-space distances according + * + * dist <- norm(A %*% t(A) - B %*% t(B), 'F') + * + * @param A semi-orthogonal matrix of dim. `n x m` such that `n < m`. + * @param B semi-orthogonal matrix of same dimensions as `A`. + * + * @details The semi-orthogonality will NOT be validated! + * + * @returns dist as describet above. + */ +// TODO: optimize! +double dist(const mat *A, const mat *B) { + int i, j, k, n = A->nrow, m = A->ncol; + double tmp1, tmp2, sum; + double *a = A->elem, *b = B->elem; + + if (A->nrow != B->nrow || A->ncol != B->ncol) { + // TODO: error handling! + } + + sum = 0.0; + for (i = 0; i < n; ++i) { + for (j = 0; j < n; ++j) { + tmp1 = tmp2 = 0.0; + for (k = 0; k < m; ++k) { + tmp1 += a[k * n + i] * a[k * n + j]; + tmp2 += b[k * n + i] * b[k * n + j]; + } + sum += (tmp1 - tmp2) * (tmp1 - tmp2); } - for (; i < nelem; ++i) { - sum += A[i] * A[i]; - } - return sqrt(sum); + } + + return sqrt(sum); +} + + +/** + * Sum of squared elements. + * + * @param A matrix. + * + * @returns the sum of squared elements of `A`. + */ +double squareSum(const mat* A) { + int i, nn = A->nrow * A->ncol; + int nn4 = 4 * (nn / 4); + double *a = A->elem; + double s = 0.0; + + for (i = 0; i < nn4; i += 4) { + s += (a[i] * a[i]) + + (a[i + 1] * a[i + 1]) + + (a[i + 2] * a[i + 2]) + + (a[i + 3] * a[i + 3]); + } + for (; i < nn; ++i) { + s += a[i] * a[i]; + } + + return s; +} + + +/** + * Row sums of a matrix `A`. + * + * @param A matrix of size `n x m`. + * @param sums vector of length `n`. + * + * @returns sums if not NULL, else a new creates vector of length `n`. 
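+ *
+ * @example (illustration) For the `2 x 3` matrix
+ *
+ *              ( 1  3  5 )
+ *          A = ( 2  4  6 )
+ *
+ *          the result is the column vector `(9, 12)^T`, i.e. the same as
+ *          the R expression `rowSums(A)`.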
+ */ +mat* rowSums(const mat *A, mat *sums) { + int i, j, block_size, block_size_i, + nrow = A->nrow, ncol = A->ncol; + const double *a; + const double *A_block = A->elem; + const double *A_end = A->elem + nrow * ncol; + double *sum; + + if (!sums) { + sums = zero(nrow, 1); + } + sum = sums->elem; + + if (nrow > CVE_MEM_CHUNK_SIZE) { + block_size = CVE_MEM_CHUNK_SIZE; } else { - error("Unknown norm type."); - } -} - -void matrixprod(const double *A, const int nrowA, const int ncolA, - const double *B, const int nrowB, const int ncolB, - double *C) { - const double one = 1.0; - const double zero = 0.0; - - // DGEMM with parameterization: - // C <- A %*% B - F77_NAME(dgemm)("N", "N", &nrowA, &ncolB, &ncolA, - &one, A, &nrowA, B, &nrowB, - &zero, C, &nrowA); -} - -void crossprod(const double *A, const int nrowA, const int ncolA, - const double *B, const int nrowB, const int ncolB, - double *C) { - const double one = 1.0; - const double zero = 0.0; - - // DGEMM with parameterization: - // C <- t(A) %*% B - F77_NAME(dgemm)("T", "N", &ncolA, &ncolB, &nrowA, - &one, A, &nrowA, B, &nrowB, - &zero, C, &ncolA); -} - -void nullProj(const double *B, const int nrowB, const int ncolB, - double *Q) { - const double minusOne = -1.0; - const double one = 1.0; - int i, nelem = nrowB * nrowB; - - // Initialize as identity matrix. - memset(Q, 0, sizeof(double) * nrowB * nrowB); - for (i = 0; i < nelem; i += nrowB + 1) { - Q[i] = 1.0; + block_size = nrow; } - // DGEMM with parameterization: - // C <- (-1.0 * B %*% t(B)) + C - F77_NAME(dgemm)("N", "T", &nrowB, &nrowB, &ncolB, - &minusOne, B, &nrowB, B, &nrowB, - &one, Q, &nrowB); + // Iterate `(block_size_i, ncol)` submatrix blocks. + for (i = 0; i < nrow; i += block_size_i) { + // Reset `A` to new block beginning. + a = A_block; + // Take block size of eveything left and reduce to max size. + block_size_i = nrow - i; + if (block_size_i > block_size) { + block_size_i = block_size; + } + // Compute first blocks column, + for (j = 0; j < block_size_i; ++j) { + sum[j] = a[j]; + } + // and sum the following columns to the first one. + for (a += nrow; a < A_end; a += nrow) { + for (j = 0; j < block_size_i; ++j) { + sum[j] += a[j]; + } + } + // Step one block forth. + A_block += block_size_i; + sum += block_size_i; + } + + return sums; } -// In place scaling of elements of A. -void scale(const double s, double *A, const int nelem) { - int i, nelemb = (nelem / 4) * 4; +/** + * Column sums of a matrix `A`. + * + * @param A matrix of size `n x m`. + * @param sums (out) vector of length `m`. + * + * @returns sums if not NULL, else a new creates vector of length `m`. + * + * @details As a vector this the result `sums` must have matrix dim. `m x 1`. + * Meaning that the result is a row vector. + */ +mat* colSums(const mat *A, mat *sums) { + int i, j, nrow = A->nrow, ncol = A->ncol; + int nrowb = 4 * (nrow / 4); // 4 * floor(nrow / 4) + double *a = A->elem; + double sum; - if (s == 1.) { - return; + if (!sums) { + sums = matrix(ncol, 1); } - for (i = 0; i < nelemb; i += 4) { - A[i] *= s; - A[i + 1] *= s; - A[i + 2] *= s; - A[i + 3] *= s; - } - for (; i < nelem; ++i) { - A[i] *= s; + for (j = 0; j < ncol; ++j) { + sum = 0.0; + for (i = 0; i < nrowb; i += 4) { + sum += a[i] + + a[i + 1] + + a[i + 2] + + a[i + 3]; + } + for (; i < nrow; ++i) { + sum += a[i]; + } + sums->elem[j] = sum; + a += nrow; } + + return sums; } -// A dence skew-symmetric rank 2 update. 
-// Perform the update -// C := alpha (A * B^T - B * A^T) + beta C -void skew(const int nrow, const int ncol, - double alpha, const double *A, const double *B, - double beta, - double *C) { +/** + * Sum of squared row differences. + * + * d_k = \sum_{l} (X_{i l} - X_{j l})^2 + * + * For a input matrix `X` of dimension `n x p` the result is a vector of length + * `n * (n - 1) / 2` (aka number of all pairs of `n` elements). The relation + * between `i, j` and `k` is: + * + * k = j n + i - j (j + 1) / 2, for 0 <= j < i < n + * + * @param X input matrix of dim. `n x p`. + * @param lvecD (out) output vector of length `n * (n - 1) / 2`. + * + * @returns lvecD if not NULL, otherwise a new created vector. + * + * @details this computes the packed vectorized lower triangular part of the + * distance matrix of `X`, aka lvec(D). + */ +mat* rowDiffSquareSums(const mat *X, mat *lvecD) { + int i, j, k, l, nrow = X->nrow, ncol = X->ncol; + const double *x; + double *d, diff; + + if (!lvecD) { + lvecD = zero(nrow * (nrow - 1) / 2, 1); + } else if (nrow * (nrow - 1) == 2 * lvecD->nrow && lvecD->ncol == 1) { + memset(lvecD->elem, 0, (nrow * (nrow - 1) / 2) * sizeof(double)); + } else { + // TODO: error handling! + } + + d = lvecD->elem; + for (l = 0; l < ncol; ++l) { + x = X->elem + l * nrow; + for (j = k = 0; j < nrow; ++j) { + for (i = j + 1; i < nrow; ++i, ++k) { + diff = x[i] - x[j]; + d[k] += diff * diff; + } + } + } + + return lvecD; +} + + +#define COLAPPLY_ALG(op) \ + for (i = 0; i < nrow; i += block_size) { \ + /* Set a, c to block's left upper corner and b to block height */ \ + a = A->elem + i; \ + b = B->elem + i; \ + c = C->elem + i; \ + /* Get block size as everyting left, then restrict size. */ \ + block_size = nrow - i; \ + if (block_size > max_size) { \ + block_size = max_size; \ + } \ + /* remove reminders by 4 from block_size */ \ + block_batch_size = 4 * (block_size / 4); \ + /* Stay on i-index "height" and move "right" in j-direction */ \ + for (j = 0; j < ncol; ++j, a += nrow, c += nrow) { \ + /* Iterate over the block column */ \ + for (k = 0; k < block_batch_size; k += 4) { \ + c[k] = a[k] op b[k]; \ + c[k + 1] = a[k + 1] op b[k + 1]; \ + c[k + 2] = a[k + 2] op b[k + 2]; \ + c[k + 3] = a[k + 3] op b[k + 3]; \ + } \ + for (; k < block_size; ++k) { \ + c[k] = a[k] op b[k]; \ + } \ + } \ + } + +/* C[, j] = A[, j] op v for each j = 1 to ncol with op as one of +, -, *, / */ + +/** + * Elment-wise operation for each row to vector. + * C <- A op B + * using recycling (column-major order) with `op` ether '+', '-', '*' or '/'. + * + * @param A matrix of size `n x m`. + * @param op type of operation, ether '+', '-', '*' or '/'. + * @param B vector or length `m` to be sweeped over rows. + * @param C (in/out) result after row sweep. + * + * @returns Passed (overwritten) `C` or new created if `C` is NULL. + */ +mat* colApply(mat *A, const char op, mat *B, mat *C) { + int i, j, k, nrow = A->nrow, ncol = A->ncol; + int max_size = CVE_MEM_CHUNK_SMALL < nrow ? CVE_MEM_CHUNK_SMALL : nrow; + int block_size, block_batch_size; + const double * restrict a, * restrict b; + double * restrict c; + + /* Check if B length is the same as count A's rows. */ + if (nrow != B->nrow) { + // TODO: error handling! + } + /* Check for existance or shape of C */ + if (!C) { + C = matrix(nrow, ncol); + } else if (C->nrow != nrow || C->ncol != ncol) { + // TODO: error handling! 
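+        // e.g. (sketch, analogous to the dimension check in `copy` above):
+        //     error("[ colApply ] result is (%d, %d), expected (%d, %d)",
+        //           C->nrow, C->ncol, nrow, ncol);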
+ } + + switch (op) { + case '+': + COLAPPLY_ALG(+) + break; + case '-': + COLAPPLY_ALG(-) + break; + case '*': + COLAPPLY_ALG(*) + break; + case '/': + // TODO: check division by zero + COLAPPLY_ALG(/) + break; + default: + // TODO: error handling: + break; + } + + return C; +} +#undef COLAPPLY_ALG + +/** + * Element-wise applies `op` with scalar. + * + * B <- A + scalar, for op = '+', + * B <- A - scalar, for op = '-', + * B <- A * scalar, for op = '*', + * B <- A / scalar, for op = '/'. + * + * @param A matrix. + * @param op one of '+', '-', '*' or '/'. + * @param scalar scalar as right hand side of operation. + * @param B matrix of the same size as `A`. May even `A`. + */ +mat* elemApply(mat *A, const char op, const double scalar, mat *B) { + int i, n = A->nrow * A->ncol; + double *a, *b; + + if (!B) { + B = matrix(A->nrow, A->ncol); + } + + if (n != B->nrow * B->ncol || (op == '/' && scalar == 0.0)) { + // TODO: error handling! + } + + a = A->elem; + b = B->elem; + switch (op) { + case '+': + for (i = 0; i < n; ++i) { + b[i] = a[i] + scalar; + } + break; + case '-': + for (i = 0; i < n; ++i) { + b[i] = a[i] - scalar; + } + break; + case '*': + for (i = 0; i < n; ++i) { + b[i] = a[i] * scalar; + } + break; + case '/': + for (i = 0; i < n; ++i) { + b[i] = a[i] / scalar; + } + break; + default: + break; + } + + return B; +} + +/** + * Linear combination of A with B, + * B <- alpha A + beta B + * + * @param alpha scaling factor for `A`. + * @param A (in) first matrix. + * @param beta scaling factor for `B`. + * @param B (in/out) second matrix of the same length as `A`. + * + * @returns passed vector `B`. + * + * @details It is NOT allowed to pass NULL. + * Also the number of elements must match, but it is legal to pass matrices of + * different shape, the result may be unwanted but leagel. + */ +mat* lincomb(const double alpha, const mat *A, const double beta, mat *B) { + int i, nn = A->nrow * A->ncol; + int nnb = 4 * (nn / 4); + double *restrict a = A->elem, *restrict b = B->elem; + + if (nn != B->nrow * B->ncol) { + // TODO: error handling! + } + + for (i = 0; i < nnb; i += 4) { + b[i] = alpha * a[i] + beta * b[i]; + b[i + 1] = alpha * a[i + 1] + beta * b[i + 1]; + b[i + 2] = alpha * a[i + 2] + beta * b[i + 2]; + b[i + 3] = alpha * a[i + 3] + beta * b[i + 3]; + } + for (; i < nn; ++i) { + b[i] = alpha * a[i] + beta * b[i]; + } + + return B; +} + +/** + * Matrix matrix product. + * C <- alpha * A %*% B + beta * C + * + * @param alpha scaling factor for product of `A` with `B`. + * @param A (in) matrix of dimension `n x k`. + * @param B (in) matrix of dimension `k x m`. + * @param beta scaling factor for original values in target matrix. + * @param C (in/out) matrix of dimension `n x m` or NULL. + * + * @details if `C` is NULL, a new matrix of dimension `n x m` is created `beta` + * is set to 0. + * + * @returns given `C` or a new created matrix if `C` was NULL or NULL on error. + */ +mat* matrixprod(const double alpha, const mat *A, const mat *B, + double beta, mat* C) { + /* Check dimensions. */ + if (A->ncol != B->nrow) { + // TODO: error handling! + } + /* Check if `C` is given. */ + if (!C) { + /* with `beta` zero, DGEMM ensures to overwrite `C`. */ + beta = 0.0; + /* Create uninitialized matrix `C`. */ + C = matrix(A->nrow, B->ncol); + } else { + /* Check target dimensions. */ + if (C->nrow != A->nrow || C->ncol != B->ncol) { + // TODO: error handling! + } + } + + /* DGEMM ... 
Dence GEneralized Matrix Matrix level 3 BLAS routine */ + F77_NAME(dgemm)("N", "N", &(A->nrow), &(B->ncol), &(A->ncol), + &alpha, A->elem, &(A->nrow), B->elem, &(B->nrow), + &beta, C->elem, &(C->nrow)); + + return C; +} + + + +/** + * Cross-product. + * C <- alpha * t(A) %*% B + beta * C + * C <- alpha * crossprod(A, B) + beta * C + * + * @param alpha scaling factor for product of `A` with `B`. + * @param A (in) matrix of dimension `k x n`. + * @param B (in) matrix of dimension `k x m`. + * @param beta scaling factor for original values in target matrix. + * @param C (in/out) matrix of dimension `n x m` or NULL. + * + * @details if `C` is NULL, a new matrix of dimension `n x m` is created `beta` + * is set to 0. + * + * @returns given `C` or a new created matrix if `C` was NULL or NULL on error. + */ +mat* crossprod(const double alpha, const mat *A, const mat *B, + double beta, mat* C) { + /* Check dimensions. */ + if (A->nrow != B->nrow) { + // TODO: error handling! + } + /* Check if `C` is given. */ + if (!C) { + /* with `beta` zero, DGEMM ensures to overwrite `C`. */ + beta = 0.0; + /* Create uninitialized matrix `C`. */ + C = matrix(A->ncol, B->ncol); + } else { + /* Check target dimensions. */ + if (C->nrow != A->ncol || C->ncol != B->ncol) { + // TODO: error handling! + } + } + + /* DGEMM ... Dence GEneralized Matrix Matrix level 3 BLAS routine */ + F77_NAME(dgemm)("T", "N", &(A->ncol), &(B->ncol), &(A->nrow), + &alpha, A->elem, &(A->nrow), B->elem, &(B->nrow), + &beta, C->elem, &(C->nrow)); + + return C; +} + +/** + * Hadamard product (Schur product or element-wise product). + * + * C <- alpha * A * B + beta * C + * + * @param alpha scaling factor for `A o B`. + * @param A matrix. + * @param B matrix of same dimensions as `A`. + * @param beta scaling factor for `C`. + * @param C (in/out) matrix of same dimensions as `A` or NULL. + * + * @returns Passed C, or if C is NULL a new created matrix. + */ +mat* hadamard(const double alpha, const mat* A, const mat* B, + double beta, mat* C) { + int i, nn = A->nrow * A->ncol; + double *a, *b, *c; + + if (A->nrow != B->nrow || A->ncol != B->ncol) { + // TODO: error handling! + } + + if (!C) { + beta = 0.0; + C = zero(A->nrow, A->ncol); + } else if (A->nrow != C->nrow || A->ncol != C->ncol) { + // TOD0: error handling! + } + + a = A->elem; + b = B->elem; + c = C->elem; + if (alpha == 1.0 && beta == 0.0) { + for (i = 0; i < nn; ++i) { + c[i] = a[i] * b[i]; + } + } else { + for (i = 0; i < nn; ++i) { + c[i] = alpha * a[i] * b[i] + beta * c[i]; + } + } + + return C; +} + +/** + * Skew-Symmetric rank-2 matrixprod: + * + * C <- alpha * (A %*% t(B) - B %*% t(A)) + beta * C + * + * @param alpha first scaling factor. + * @param A (in) matrix of dimension `n x k`. + * @param B (in) matrix of dimension `n x k`. + * @param beta scaling factor for original values in target matrix. + * @param C (in/out) matrix of dimension `n x n` or NULL. + * + * @details if `C` is NULL, a new matrix of dimension `n x n` is created, `beta` + * is set to 0. + * + * @returns given `C` or a new created matrix if `C` was NULL or NULL on error. + */ +mat* skew(double alpha, mat *A, mat *B, double beta, mat *C) { + + if (A->nrow != B->nrow || A->ncol != B->ncol) { + // TODO: error handling! + } + + if (!C) { + beta = 0.0; + C = matrix(A->nrow, A->nrow); + } else if (C->nrow != A->nrow || C->ncol != A->nrow) { + // TODO: error handling! 
+ } + F77_NAME(dgemm)("N", "T", - &nrow, &nrow, &ncol, - &alpha, A, &nrow, B, &nrow, - &beta, C, &nrow); + &(C->nrow), &(C->ncol), &(A->ncol), + &alpha, A->elem, &(A->nrow), B->elem, &(B->nrow), + &beta, C->elem, &(C->nrow)); alpha *= -1.0; beta = 1.0; F77_NAME(dgemm)("N", "T", - &nrow, &nrow, &ncol, - &alpha, B, &nrow, A, &nrow, - &beta, C, &nrow); + &(C->nrow), &(C->ncol), &(B->ncol), + &alpha, B->elem, &(B->nrow), A->elem, &(A->nrow), + &beta, C->elem, &(C->nrow)); + + return C; } +/** + * Integer square root. + * isqrt(n) = floor(sqrt(n)) + * which is the biggest positive integer such that its square is smaller or + * equal to `n`. + * + * @param n positive integer. + * + * @returns integer square root of `n`. + */ +static int isqrt(const int n) { + int x = 1, y; -/** Cayley transformation of matrix `B` using the Skew-Symmetri matrix `A`. - * X = (I + A)^-1 (I - A) B + if (n < 0) { + // TODO: error handling! + } else if (n < 4) { + return 1; + } else if (n < 9) { // Only required for n <= 4. + return 2; + } + + while (1) { + y = (x + n / x) / 2; + if (x == y || x + 1 == y) { + return x; + } + x = y; + } +} + +/** + * Vector to symmetric matrix transform. + * + * Given a vector `A` of length `N = n * (n - 1) / 2` a symmetric matrix `B` + * of dimension `n x n` is created such that (0-indexed) + * + * b_{i j} = b_{j i} = a_k, for k = j * n + i - j * (j + 1) / 2 + * + * with 0 <= j < i < n. The main diagonal elements are set to `diag`. + * + * @example The vector `A = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)^T` with + * `diag = -1` results in + * + * ( -1 0 1 2 3 ) + * ( 0 -1 4 5 6 ) + * B = ( 1 4 -1 7 8 ) + * ( 2 5 7 -1 9 ) + * ( 3 6 8 9 -1 ) + * + * @param A (in) source vector of length `N = n * (n - 1) / 2`. + * @param B (in/out) destination matrix of dim. `n x n`. + * @param diag value of all diagonal elements. + * + * @returns passed matrix `B` or new created if `B` is NULL. + */ +mat* lvecToSym(const mat* A, const double diag, mat* B) { + int i, j, k, l, n, N = A->nrow; + int max_size, block_size; + double *a, *b, *L, *U; + + if (!B) { + n = (1 + isqrt(1 + 8 * N)) / 2; + if (n * (n - 1) != 2 * N) { + // TODO: error handling! + } + B = matrix(n, n); + } else { + n = B->nrow; + if (n * (n - 1) != 2 * N || n != B->ncol) { + // TODO: error handling! + } + } + + /* Copy consecutive values of `A` to lower triangular part of `B`. */ + a = A->elem; + for (j = k = 0; j < n; ++j) { + b = B->elem + j * n; + b[j] = diag; + for (i = j + 1; i < n; ++i, ++k) { + b[i] = a[k]; + } + } + + /* Mirror along the main diagonal, aka transposed copy of lower to upper. */ + b = B->elem; + max_size = BLOCK_SIZE * (n / BLOCK_SIZE); + /* Iterate over blocked columns */ + for (j = 0; j < n; j += BLOCK_SIZE) { + /* determine height of the block (for reminding partial blocks) */ + block_size = j < max_size ? BLOCK_SIZE : (n % BLOCK_SIZE); + /* Set diagonal block (left upper corner on the main diagonal) */ + L = b + (j * n + j); // diagonal block. + /* Transpose block on main diagonal */ + for (l = 0; l < block_size; ++l) { + for (k = l + 1; k < block_size; ++k) { + L[k * n + l] = L[l * n + k]; + } + } + /* Iterate complete blocks below previous halve block */ + for (i = j + BLOCK_SIZE; i < n; i += BLOCK_SIZE) { + /* determine height of the block (for reminding partial blocks) */ + block_size = i < max_size ? BLOCK_SIZE : (n % BLOCK_SIZE); + /* Set lower (L, below diag) and upper (U, above diag) blocks */ + L = b + j * n + i; + U = b + i * n + j; + /* Transpose lower (L) to upper (U) complete block. 
*/ + for (l = 0; l < BLOCK_SIZE; ++l) { + for (k = 0; k < block_size; ++k) { + U[k * n + l] = L[l * n + k]; + } + } + } + } + + return B; +} + +/** + * In place transformation into symmetric matrix as + * A <- diag(colSums(A + t(A))) - (A + t(A)) + * + * @param A (in/out) square matrix. + * @param workMem (out) double[n] vector as working memory. Will be storing the + * diagonal elements of the result matrix. Must have at least length `n`. + * + * @returns laplace transformed input `A`. + * + * @details The workMem parameter is required as working memory. + */ +mat* laplace(mat *A, double *workMem) { + int i, j, k, l, n = A->nrow; + int max_size = BLOCK_SIZE * (n / BLOCK_SIZE); + int block_size; + double *a = A->elem; + double *L, *U; + + /* Check if A is square */ + if (n != A->ncol) { + // TODO: error handling! + } + + /* Ckeck if working memory is supplied. */ + if (!workMem) { + workMem = (double*)R_alloc(n, sizeof(double)); + } + + /* init working memory */ + memset(workMem, 0, n * sizeof(double)); + + /* Iterate over column slices */ + for (j = 0; j < n; j += BLOCK_SIZE) { + /* determine height of the block (for reminding partial blocks) */ + block_size = j < max_size ? BLOCK_SIZE : (n % BLOCK_SIZE); + /* Set diagonal block (left upper corner on the main diagonal) */ + L = a + (j * n + j); // D block in description. + /* Transpose block on main diagonal */ + for (l = 0; l < block_size; ++l) { + for (k = l + 1; k < block_size; ++k) { + workMem[j + l] += L[l * n + k] + = L[k * n + l] + = -(L[l * n + k] + L[k * n + l]); + } + } + /* Iterate complete blocks below previous halve block */ + for (i = j + BLOCK_SIZE; i < n; i += BLOCK_SIZE) { + /* determine height of the block (for reminding partial blocks) */ + block_size = i < max_size ? BLOCK_SIZE : (n % BLOCK_SIZE); + /* Set lower (L, below diag) and upper (U, above diag) blocks */ + L = a + j * n + i; + U = a + i * n + j; + /* Transpose lower (L) to upper (U) complete block. */ + for (l = 0; l < BLOCK_SIZE; ++l) { + for (k = 0; k < block_size; ++k) { + workMem[j + l] += L[l * n + k] + = U[k * n + l] + = -(L[l * n + k] + U[k * n + l]); + } + } + } + } + /* Sum remaining upper diagonal column sum parts and set diagonal */ + for (j = 0; j < n; ++j) { + for (i = 0; i < j; ++i) { + workMem[j] += a[i]; + } + a[j] = -workMem[j]; + a += n; + } + + return A; +} + +/** Cayley transformation of matrix `B` using a Skew-Symmetric matrix `A`. + * + * C = (I + A)^-1 (I - A) B + * * by solving the following linear equation: - * (I + A) X = (I - A) B ==> X = (I + A)^-1 (I - A) B + * + * (I + A) C = (I - A) B ==> C = (I + A)^-1 (I - A) B * \_____/ \_____/ - * IpA X = ImA B + * IpA C = ImA B * \_______/ - * IpA X = Y ==> X = IpA^-1 Y + * IpA C = Y ==> C = IpA^-1 Y * * @param A Skew-Symmetric matrix of dimension `(n, n)`. * @param B Matrix of dimensions `(n, m)` with `m <= n`. - * @return Transformed matrix `X`. + * @param C Matrix of dimensions `(n, m)` with `m <= n`. + * @param workMem working memory array of length at least `2 p^2 + p`. + * @return Transformed matrix `C`. 
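+ * @details Layout of `workMem` (hence the `2 p^2 + p` requirement with
+ *     `p = nrow(A)`): `p` pivot indices for `dgesv`, followed by `p x p`
+ *     doubles holding `I + A` and another `p x p` doubles holding `I - A`.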
* @note This opperation is equivalent to the R expression: * solve(diag(1, n) + A) %*% (diag(1, n) - A) %*% B * or * solve(diag(1, n) + A, (diag(1, n) - A) %*% B) */ -void cayleyTransform(const int p, const int q, - const double *A, const double *B, - double *X, double *workMem) { - int i, info, pp = p * p; - double zero = 0., one = 1.; +mat* cayleyTransform(mat *A, mat *B, mat *C, double *workMem) { + int i, info, pp = A->nrow * A->nrow; + double zero = 0.0, one = 1.0; + + // TODO: validate dimensions!!! + // TODO: calrify a bit ^^ + + if (!C) { + C = matrix(B->nrow, B->ncol); + } else if (B->nrow != C->nrow || B->ncol != C->ncol) { + // TODO: error handling! + } /* Allocate row permutation array used by `dgesv` */ int *ipiv = (int*)workMem; /* Create Matrix IpA = I + A (I plus A) */ - double *IpA = (double*)(ipiv + p); - memcpy(IpA, A, pp * sizeof(double)); - for (i = 0; i < pp; i += p + 1) { + double *IpA = (double*)(ipiv + A->nrow); + memcpy(IpA, A->elem, A->nrow * A->ncol * sizeof(double)); + for (i = 0; i < pp; i += A->nrow + 1) { IpA[i] += 1.; // +1 to diagonal elements. } /* Create Matrix ImA = I - A (I minus A) */ double *ImA = IpA + pp; + double *a = A->elem; for (i = 0; i < pp; ++i) { - ImA[i] = -A[i]; + ImA[i] = -a[i]; } - for (i = 0; i < pp; i += p + 1) { - ImA[i] += 1.; // +1 to diagonal elements. + for (i = 0; i < pp; i += A->nrow + 1) { + ImA[i] += 1.0; // +1 to diagonal elements. } /* Y as matrix-matrix product of ImA and B: * Y = 1 * ImA * B + 0 * Y */ - F77_CALL(dgemm)("N", "N", &p, &q, &p, - &one, ImA, &p, B, &p, &zero, X, &p); + F77_CALL(dgemm)("N", "N", &(A->nrow), &(B->ncol), &(A->nrow), + &one, ImA, &(A->nrow), B->elem, &(B->nrow), + &zero, C->elem, &(C->nrow)); - /* Solve system IpA Y = X for Y (and store result in X). - * aka. X = IpA^-1 X */ - F77_CALL(dgesv)(&p, &q, IpA, &p, ipiv, X, &p, &info); + /* Solve system IpA Y = C for Y (and store result in C). + * aka. C = IpA^-1 X */ + F77_CALL(dgesv)(&(A->nrow), &(B->ncol), IpA, &(A->nrow), + ipiv, C->elem, &(A->nrow), &info); if (info) { error("[ cayleyTransform ] error in dgesv - info %d", info); } + + return C; } diff --git a/CVE_C/src/rStiefel.c b/CVE_C/src/rStiefel.c index 5d84380..6ba7f2f 100644 --- a/CVE_C/src/rStiefel.c +++ b/CVE_C/src/rStiefel.c @@ -1,66 +1,31 @@ #include "cve.h" -// /** -// * Performas a QR factorization and computes the Q factor. -// * -// * @param A matrix. -// * @returns The Q factor of the QR factorization `A = QR`. -// */ -// SEXP qrQ(SEXP Ain) { -// int i, j, info; - -// if (!isMatrix(Ain)) { -// error("Argument must be a matrix."); -// } -// int nrow = nrows(Ain); -// int ncol = ncols(Ain); - -// double *A = (double*)R_alloc(nrow * ncol, sizeof(double)); -// memcpy(A, REAL(Ain), nrow * ncol * sizeof(double)); - -// // double *A = REAL(Ain); -// // Scalar factors of elementary reflectors. -// double *tau = (double*)R_alloc(ncol, sizeof(double)); - -// // Create Working memory area. 
-// int lenWork = 3 * nrow; -// double *memWork = (double*)R_alloc(lenWork, sizeof(double)); - -// F77_NAME(dgeqrf)(&nrow, &ncol, A, &nrow, tau, -// memWork, &lenWork, &info); - -// SEXP Qout = PROTECT(allocMatrix(REALSXP, nrow, ncol)); -// double *Q = REAL(Qout); - -// for (j = 0; j < ncol; ++j) { -// for (i = 0; i < nrow; ++i) { -// if (i == j) { -// Q[i + nrow * j] = 1.; -// } else { -// Q[i + nrow * j] = 0.; -// } -// } -// } - -// F77_NAME(dormqr)("L", "N", &nrow, &ncol, &ncol, A, &nrow, tau, Q, &nrow, -// memWork, &lenWork, &info); - -// UNPROTECT(1); -// return Qout; -// } - /** - * Draws a sample from invariant measure on the Stiefel manifold \eqn{S(p, q)}. + * Draws a sample from invariant measure on the Stiefel manifold \eqn{S(p, q)}. * - * @param p row dimension - * @param q col dimension - * @return \code{p} times \code{q} semi-orthogonal matrix. - * `V <- qr.Q(qr(matrix(rnorm(p * q, 0, 1), p, q)))` + * @param p row dimension + * @param q column dimension + * @param V (in/out) matrix of dimensions `p x q` or NULL. + * @param workMem work space array of length greater-equal than `2pq + q`. + * + * @return Passed matrix `V` or new created if `V` is NULL. + * + * @example Performs the same operation as the following `R` code: + * V <- qr.Q(qr(matrix(rnorm(p * q, 0, 1), p, q))) + * + * @details ATTENTION: The length of workMem must be at least `2pq + q`. */ -void rStiefel(const int p, const int q, double *V, - double *workMem, int workLen) { - int i, j, info; +mat* rStiefel(const int p, const int q, mat *V, double *workMem) { + int i, j, info, workLen = 2 * p * q + q; int pq = p * q; + double *v; + + if (!V) { + V = matrix(p, q); + } else if (V->nrow != p || V->ncol != q) { + // TODO: error handling! + } + v = V->elem; GetRNGstate(); for (i = 0; i < pq; ++i) { @@ -76,14 +41,12 @@ void rStiefel(const int p, const int q, double *V, for (j = 0; j < q; ++j) { for (i = 0; i < p; ++i) { - if (i == j) { - V[i + p * j] = 1.; - } else { - V[i + p * j] = 0.; - } + v[p * j + i] = i == j ? 1.0 : 0.0; } } - F77_NAME(dormqr)("L", "N", &p, &q, &q, workMem, &p, tau, V, &p, + F77_NAME(dormqr)("L", "N", &p, &q, &q, workMem, &p, tau, V->elem, &p, workMem + pq + q, &workLen, &info); + + return V; } diff --git a/CVE_C/src/rowColOp.c b/CVE_C/src/rowColOp.c deleted file mode 100644 index 2d9e1c2..0000000 --- a/CVE_C/src/rowColOp.c +++ /dev/null @@ -1,160 +0,0 @@ -#include "cve.h" - -/** - * Computes the row sums of a matrix `A`. - * @param A Pointer to col-major matrix elements, size is `nrow * ncol`. - * @param nrow Number of rows of `A`. - * @param ncol Number of columns of `A`. - * @param sum Pointer to output row sums of size `nrow`. - */ -void rowSums(const double *A, const int nrow, const int ncol, - double *sum) { - int i, j, block_size, block_size_i; - const double *A_block = A; - const double *A_end = A + nrow * ncol; - - if (nrow > CVE_MEM_CHUNK_SIZE) { - block_size = CVE_MEM_CHUNK_SIZE; - } else { - block_size = nrow; - } - - // Iterate `(block_size_i, ncol)` submatrix blocks. - for (i = 0; i < nrow; i += block_size_i) { - // Reset `A` to new block beginning. - A = A_block; - // Take block size of eveything left and reduce to max size. - block_size_i = nrow - i; - if (block_size_i > block_size) { - block_size_i = block_size; - } - // Compute first blocks column, - for (j = 0; j < block_size_i; ++j) { - sum[j] = A[j]; - } - // and sum the following columns to the first one. 
- for (A += nrow; A < A_end; A += nrow) { - for (j = 0; j < block_size_i; ++j) { - sum[j] += A[j]; - } - } - // Step one block forth. - A_block += block_size_i; - sum += block_size_i; - } -} - -void colSums(const double *A, const int nrow, const int ncol, - double *colSums) { - int i, j; - int nrowb = 4 * (nrow / 4); // 4 * floor(nrow / 4) - double colSum; - - for (j = 0; j < ncol; ++j) { - colSum = 0.0; - for (i = 0; i < nrowb; i += 4) { - colSum += A[i] - + A[i + 1] - + A[i + 2] - + A[i + 3]; - } - for (; i < nrow; ++i) { - colSum += A[i]; - } - *(colSums++) = colSum; - A += nrow; - } -} - -void rowSquareSums(const double *A, - const int nrow, const int ncol, - double *sum) { - int i, j, block_size, block_size_i; - const double *A_block = A; - const double *A_end = A + nrow * ncol; - - if (nrow > CVE_MEM_CHUNK_SIZE) { - block_size = CVE_MEM_CHUNK_SIZE; - } else { - block_size = nrow; - } - - // Iterate `(block_size_i, ncol)` submatrix blocks. - for (i = 0; i < nrow; i += block_size_i) { - // Reset `A` to new block beginning. - A = A_block; - // Take block size of eveything left and reduce to max size. - block_size_i = nrow - i; - if (block_size_i > block_size) { - block_size_i = block_size; - } - // Compute first blocks column, - for (j = 0; j < block_size_i; ++j) { - sum[j] = A[j] * A[j]; - } - // and sum the following columns to the first one. - for (A += nrow; A < A_end; A += nrow) { - for (j = 0; j < block_size_i; ++j) { - sum[j] += A[j] * A[j]; - } - } - // Step one block forth. - A_block += block_size_i; - sum += block_size_i; - } -} - -void rowSumsSymVec(const double *Avec, const int nrow, - const double diag, - double *sum) { - int i, j; - - if (diag == 0.0) { - memset(sum, 0, nrow * sizeof(double)); - } else { - for (i = 0; i < nrow; ++i) { - sum[i] = diag; - } - } - - for (j = 0; j < nrow; ++j) { - for (i = j + 1; i < nrow; ++i, ++Avec) { - sum[j] += *Avec; - sum[i] += *Avec; - } - } -} - -void rowDiffs(const double* X, const int nrow, const int ncol, - double *diffs) { - int i, j, k, l; - const double *Xcol; - - for (k = l = 0; l < ncol; ++l) { - Xcol = X + l * nrow; - for (i = 0; i < nrow; ++i) { - for (j = i + 1; j < nrow; ++j) { - diffs[k++] = Xcol[i] - Xcol[j]; - } - } - } -} - -void rowDiffSquareSums(const double* X, const int nrow, const int ncol, - double *sum) { - int i, j, k, l; - const double *Xcol; - double tmp; - - memset(sum, 0, ((nrow * (nrow - 1)) / 2) * sizeof(double)); - - for (l = 0; l < ncol; ++l) { - Xcol = X + l * nrow; - for (k = i = 0; i < nrow; ++i) { - for (j = i + 1; j < nrow; ++j, ++k) { - tmp = Xcol[i] - Xcol[j]; - sum[k] += tmp * tmp; - } - } - } -} diff --git a/CVE_C/src/sweep.c b/CVE_C/src/sweep.c deleted file mode 100644 index e1a3e23..0000000 --- a/CVE_C/src/sweep.c +++ /dev/null @@ -1,51 +0,0 @@ -#include "cve.h" - -#define ROW_SWEEP_ALG(op) \ - /* Iterate `(block_size_i, ncol)` submatrix blocks. */ \ - for (i = 0; i < nrow; i += block_size_i) { \ - /* Set `A` and `C` to block beginning. */ \ - A = A_block; \ - C = C_block; \ - /* Get current block's row size. */ \ - block_size_i = nrow - i; \ - if (block_size_i > block_size) { \ - block_size_i = block_size; \ - } \ - /* Perform element wise operation for block. */ \ - for (; A < A_end; A += nrow, C += nrow) { \ - for (j = 0; j < block_size_i; ++j) { \ - C[j] = (A[j]) op (v[j]); \ - } \ - } \ - /* Step one block forth. 
*/ \ - A_block += block_size_i; \ - C_block += block_size_i; \ - v += block_size_i; \ - } - -/* C[, j] = A[, j] op v for each j = 1 to ncol with op as one of +, -, *, / */ -void rowSweep(const double *A, const int nrow, const int ncol, - const char* op, - const double *v, // vector of length nrow - double *C) { - int i, j, block_size, block_size_i; - const double *A_block = A; - double *C_block = C; - const double *A_end = A + nrow * ncol; - - if (nrow > CVE_MEM_CHUNK_SMALL) { // small because 3 vectors in cache - block_size = CVE_MEM_CHUNK_SMALL; - } else { - block_size = nrow; - } - - if (*op == '+') { - ROW_SWEEP_ALG(+) - } else if (*op == '-') { - ROW_SWEEP_ALG(-) - } else if (*op == '*') { - ROW_SWEEP_ALG(*) - } else if (*op == '/') { - ROW_SWEEP_ALG(/) - } -} diff --git a/README.md b/README.md index 7469d4b..3a7f202 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,8 @@ Performance: - [NOT Feasible] Stochastic Version - [NOT Feasible] Gradient Approximations (using Algebraic Software for alternative Loss function formulations and gradient optimizations) - [NOT Sufficient] Alternative Kernels for reducing samples -- [ ] (To Be further investigated) "Kronecker" optimization +- [x] (To Be further investigated) "Kronecker" optimization +- [ ] Implement "Kronecker" optimization Features (functions): - [x] Initial `V.init` parameter (only ONE try, ignore number of `attempts` parameter) @@ -48,6 +49,7 @@ Changes: Errors: - [x] `CVE_C` compare to `CVE_legacy`. - [x] fix: `predict.dim` not found. +- [ ] fix/check: error computation for break condition (in `cve.c`) # Development ## Build and install. diff --git a/runtime_test.R b/runtime_test.R index aab3e9c..9589537 100644 --- a/runtime_test.R +++ b/runtime_test.R @@ -23,11 +23,11 @@ subspace.dist <- function(B1, B2){ set.seed(437) # Number of simulations -SIM.NR <- 50 +SIM.NR <- 50L # maximal number of iterations in curvilinear search algorithm -MAXIT <- 50 +MAXIT <- 50L # number of arbitrary starting values for curvilinear optimization -ATTEMPTS <- 10 +ATTEMPTS <- 10L # set names of datasets dataset.names <- c("M1", "M2", "M3", "M4", "M5") # Set used CVE method diff --git a/test.R b/test.R index d510e4a..bd714f6 100644 --- a/test.R +++ b/test.R @@ -17,10 +17,10 @@ library(CVE) path <- paste0('~/Projects/CVE/tmp/logger_', method, '_', momentum, '.C.pdf') # Define logger for `cve()` method. -logger <- function(iter, attempt, data) { +logger <- function(attempt, iter, data) { # Note the `<<-` assignement! loss.history[iter + 1, attempt] <<- data$loss - error.history[iter + 1, attempt] <<- if (data$err > 0) data$err else NA + error.history[iter + 1, attempt] <<- data$err tau.history[iter + 1, attempt] <<- data$tau # Compute true error by comparing to the true `B` B.est <- null(data$V) # Function provided by CVE