From 47917fe0bd1f203e57760a061e4a877c864beaca Mon Sep 17 00:00:00 2001 From: daniel Date: Tue, 3 Sep 2019 20:43:34 +0200 Subject: [PATCH] add: to notes, fix: bottleneck in grad --- CVE_R/R/gradient.R | 12 ++++++++- notes.md | 61 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/CVE_R/R/gradient.R b/CVE_R/R/gradient.R index 4ece9e5..7ffcb69 100644 --- a/CVE_R/R/gradient.R +++ b/CVE_R/R/gradient.R @@ -67,7 +67,17 @@ grad <- function(X, Y, V, h, vecS <- vecS * vecD # The gradient. - G <- crossprod(X_diff, sweep(X_diff %*% V, 1, vecS, `*`)) + # 1. The `crossprod(A, B)` is equivalent to `t(A) %*% B`, + # 2. `(X_diff %*% V) * vecS` is first a marix matrix mult. and then using + # recycling to scale each row with the values of `vecS`. + # Note that `vecS` is a vector and that `R` uses column-major ordering + # of matrices. + # (See: notes for more details) + # TODO: Depending on n, p, q decide which version to take (for current + # datasets "inner" is faster, see: notes). + # inner = crossprod(X_diff, X_diff * vecS) %*% V, + # outer = crossprod(X_diff, (X_diff %*% V) * vecS) + G <- crossprod(X_diff, X_diff * vecS) %*% V G <- (-2 / (n * h^2)) * G return(G) } diff --git a/notes.md b/notes.md index bfade3c..8f36f0a 100644 --- a/notes.md +++ b/notes.md @@ -33,9 +33,15 @@ install.packages(path, repos = NULL, type = "source") The runtime tests (upcomming further tests) are creating log files saved in `tmp/`. These log files are `CSV` files (actualy `TSV`) with a header storing the test results. Depending on the test the files may contain differnt data. As an example we use the runtime test logs which store in each line the `dataset`, the used `method` as well as the `error` (actual error of estimated `B` against real `B`) and the `time`. For reading and analysing the data see the following example. 
```R # Load log as `data.frame` -test0 <- read.csv('tmp/test0.log', sep = '\t') +log <- read.csv('tmp/test0.log', sep = '\t') # Create a error boxplot grouped by dataset. -boxplot(error ~ dataset, test0) +boxplot(error ~ dataset, log) + +# Overview +for (ds.name in paste0('M', seq(5))) { + ds <- subset(log, dataset == ds.name, select = c('method', 'dataset', 'time', 'error')) + print(summary(ds)) +} ``` ## Environments and variable lookup. @@ -187,6 +193,57 @@ microbenchmark( # tcrossprod(V, V) 1.003 1.1575 1.28451 1.2400 1.3685 2.742 100 ``` +### Recycling vs. Sweep +```R +(n <- 200) +(p <- 12) +(q <- 10) +X_diff <- matrix(runif(n * (n - 1) / 2 * p), n * (n - 1) / 2, p) +V <- matrix(rnorm(p * q), p, q) +vecS <- runif(n * (n - 1) / 2) + +stopifnot( + all.equal((X_diff %*% V) * rep(vecS, q), + sweep(X_diff %*% V, 1, vecS, `*`)), + all.equal((X_diff %*% V) * rep(vecS, q), + (X_diff %*% V) * vecS) +) +microbenchmark( + rep = (X_diff %*% V) * rep(vecS, q), + sweep = sweep(X_diff %*% V, 1, vecS, `*`, check.margin = FALSE), + recycle = (X_diff %*% V) * vecS +) +# Unit: microseconds +# expr min lq mean median uq max neval +# rep 851.723 988.3655 1575.639 1203.6385 1440.578 18999.23 100 +# sweep 1313.177 1522.4010 2355.269 1879.2605 2065.399 18783.24 100 +# recycle 719.001 786.1265 1157.285 881.8825 1163.202 19091.79 100 +``` +### Scaled `crossprod` with matmul order. 
+```R
+(n <- 200)
+(p <- 12)
+(q <- 10)
+X_diff <- matrix(runif(n * (n - 1) / 2 * p), n * (n - 1) / 2, p)
+V <- matrix(rnorm(p * q), p, q)
+vecS <- runif(n * (n - 1) / 2)
+
+ref <- crossprod(X_diff, X_diff * vecS) %*% V
+stopifnot(
+    all.equal(ref, crossprod(X_diff, (X_diff %*% V) * vecS)),
+    all.equal(ref, t(X_diff) %*% (X_diff * vecS) %*% V)
+)
+microbenchmark(
+    inner = crossprod(X_diff, X_diff * vecS) %*% V,
+    outer = crossprod(X_diff, (X_diff %*% V) * vecS)
+)
+# Unit: microseconds
+#  expr      min       lq     mean    median       uq       max neval
+# inner  789.065  867.939 1683.812  987.9375 1290.055 16800.265   100
+# outer 1141.479 1216.929 1404.702 1317.7315 1582.800  2531.766   100
+```
+
+
 ## Using `Rprof()` for performance.
 The standart method for profiling where an algorithm is spending its time is with `Rprof()`.
 ```R