2
0
Fork 0

add: cve_linesearch,

fix: cleaned and optimized gradient,
add. notes
This commit is contained in:
Daniel Kapla 2019-09-02 21:07:56 +02:00
parent 1c120ec67c
commit 7d4d01a9a7
11 changed files with 397 additions and 60 deletions

View File

@ -6,6 +6,7 @@ export(col.pair.apply)
export(cve)
export(cve.call)
export(cve.grid.search)
export(cve_linesearch)
export(cve_sgd)
export(cve_simple)
export(dataset)

View File

@ -94,6 +94,8 @@ cve.call <- function(X, Y, method = "simple", nObs = nrow(X)^.5, k, ...) {
method <- tolower(method)
if (method == 'simple') {
dr <- cve_simple(X, Y, k, nObs = nObs, ...)
} else if (method == 'linesearch') {
dr <- cve_linesearch(X, Y, k, nObs = nObs, ...)
} else if (method == 'sgd') {
dr <- cve_sgd(X, Y, k, nObs = nObs, ...)
} else {

163
CVE_R/R/cve_linesearch.R Normal file
View File

@ -0,0 +1,163 @@
#' Implementation of the CVE method using curvilinear linesearch with Armijo-Wolfe
#' conditions.
#'
#' @keywords internal
#' @export
cve_linesearch <- function(X, Y, k,
nObs = sqrt(nrow(X)),
h = NULL,
tau = 1.0,
tol = 1e-3,
rho1 = 0.1,
rho2 = 0.9,
slack = 0,
epochs = 50L,
attempts = 10L,
max.linesearch.iter = 10L
) {
# Set `grad` functions environment to enable if to find this environments
# local variabels, needed to enable the manipulation of this local variables
# from within `grad`.
environment(grad) <- environment()
# Setup histories.
loss.history <- matrix(NA, epochs, attempts)
error.history <- matrix(NA, epochs, attempts)
tau.history <- matrix(NA, epochs, attempts)
# Get dimensions.
n <- nrow(X)
p <- ncol(X)
q <- p - k
# Save initial learning rate `tau`.
tau.init <- tau
# Addapt tolearance for break condition.
tol <- sqrt(2 * q) * tol
# Estaimate bandwidth if not given.
if (missing(h) | !is.numeric(h)) {
h <- estimate.bandwidth(X, k, nObs)
}
# Compute persistent data.
# Compute lookup indexes for symmetrie, lower/upper
# triangular parts and vectorization.
pair.index <- elem.pairs(seq(n))
i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
# Matrix of vectorized indices. (vec(index) -> seq)
index <- matrix(seq(n * n), n, n)
lower <- index[lower.tri(index)]
upper <- t(index)[lower]
# Create all pairewise differences of rows of `X`.
X_diff <- X[i, , drop = F] - X[j, , drop = F]
# Identity matrix.
I_p <- diag(1, p)
# Init tracking of current best (according multiple attempts).
V.best <- NULL
loss.best <- Inf
# Start loop for multiple attempts.
for (attempt in 1:attempts) {
# Sample a `(p, q)` dimensional matrix from the stiefel manifold as
# optimization start value.
V <- rStiefl(p, q)
# Initial loss and gradient.
loss <- Inf
G <- grad(X, Y, V, h, loss.out = TRUE, persistent = TRUE)
# Set last loss (aka, loss after applying the step).
loss.last <- loss
## Start optimization loop.
for (epoch in 1:epochs) {
# Cayley transform matrix `A`
A <- (G %*% t(V)) - (V %*% t(G))
# Directional derivative of the loss at current position, given
# as `Tr(G^T \cdot A \cdot V)`.
loss.prime <- -0.5 * norm(A, type = 'F')^2
# Linesearch
tau.upper <- Inf
tau.lower <- 0
tau <- tau.init
for (iter in 1:max.linesearch.iter) {
# Apply learning rate `tau`.
A.tau <- (tau / 2) * A
# Parallet transport (on Stiefl manifold) into direction of `G`.
inv <- solve(I_p + A.tau)
V.tau <- inv %*% ((I_p - A.tau) %*% V)
# Loss at position after a step.
loss <- Inf # aka loss.tau
G.tau <- grad(X, Y, V.tau, h, loss.out = TRUE, persistent = TRUE)
# Armijo condition.
if (loss > loss.last + (rho1 * tau * loss.prime)) {
tau.upper <- tau
tau <- (tau.lower + tau.upper) / 2
next()
}
V.prime.tau <- -0.5 * inv %*% A %*% (V + V.tau)
loss.prime.tau <- sum(G * V.prime.tau) # Tr(grad(tau)^T \cdot Y^'(tau))
# Wolfe condition.
if (loss.prime.tau < rho2 * loss.prime) {
tau.lower <- tau
if (tau.upper == Inf) {
tau <- 2 * tau.lower
} else {
tau <- (tau.lower + tau.upper) / 2
}
} else {
break()
}
}
# Compute error.
error <- norm(V %*% t(V) - V.tau %*% t(V.tau), type = "F")
# Write history.
loss.history[epoch, attempt] <- loss
error.history[epoch, attempt] <- error
tau.history[epoch, attempt] <- tau
# Check break condition (epoch check to skip ignored gradient calc).
# Note: the devision by `sqrt(2 * k)` is included in `tol`.
if (error < tol | epoch >= epochs) {
# take last step and stop optimization.
V <- V.tau
break()
}
# Perform the step and remember previous loss.
V <- V.tau
loss.last <- loss
G <- G.tau
}
# Check if current attempt improved previous ones
if (loss < loss.best) {
loss.best <- loss
V.best <- V
}
}
return(list(
loss.history = loss.history,
error.history = error.history,
tau.history = tau.history,
loss = loss.best,
V = V.best,
B = null(V.best),
h = h
))
}

View File

@ -7,10 +7,20 @@ cve_sgd <- function(X, Y, k,
nObs = sqrt(nrow(X)),
h = NULL,
tau = 0.01,
tol = 1e-3,
epochs = 50L,
batch.size = 16L,
attempts = 10L
) {
# Set `grad` functions environment to enable if to find this environments
# local variabels, needed to enable the manipulation of this local variables
# from within `grad`.
environment(grad) <- environment()
# Setup histories.
loss.history <- matrix(NA, epochs, attempts)
error.history <- matrix(NA, epochs, attempts)
# Get dimensions.
n <- nrow(X) # Number of samples.
p <- ncol(X) # Data dimensions
@ -18,15 +28,31 @@ cve_sgd <- function(X, Y, k,
# Save initial learning rate `tau`.
tau.init <- tau
# Addapt tolearance for break condition.
tol <- sqrt(2 * q) * tol
# Estaimate bandwidth if not given.
if (missing(h) | !is.numeric(h)) {
h <- estimate.bandwidth(X, k, nObs)
}
# Compute persistent data.
# Compute lookup indexes for symmetrie, lower/upper
# triangular parts and vectorization.
pair.index <- elem.pairs(seq(n))
i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
# Matrix of vectorized indices. (vec(index) -> seq)
index <- matrix(seq(n * n), n, n)
lower <- index[lower.tri(index)]
upper <- t(index)[lower]
# Create all pairewise differences of rows of `X`.
X_diff <- X[i, , drop = F] - X[j, , drop = F]
# Identity matrix.
I_p <- diag(1, p)
# Init a list of data indices (shuffled for batching).
indices <- seq(n)
I_p <- diag(1, p)
# Init tracking of current best (according multiple attempts).
V.best <- NULL
@ -40,6 +66,8 @@ cve_sgd <- function(X, Y, k,
# Sample a `(p, q)` dimensional matrix from the stiefel manifold as
# optimization start value.
V <- rStiefl(p, q)
# Keep track of last `V` for computing error after an epoch.
V.last <- V
# Repeat `epochs` times
for (epoch in 1:epochs) {
@ -55,7 +83,7 @@ cve_sgd <- function(X, Y, k,
# Compute batch gradient.
loss <- NULL
G <- grad(X[batch, ], Y[batch], V, h)
G <- grad(X[batch, ], Y[batch], V, h, loss.out = TRUE)
# Cayley transform matrix.
A <- (G %*% t(V)) - (V %*% t(G))
@ -65,27 +93,34 @@ cve_sgd <- function(X, Y, k,
# Parallet transport (on Stiefl manifold) into direction of `G`.
V <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V)
}
}
# Compute actuall loss after finishing optimization.
loss <- grad(X, Y, V, h, loss.only = TRUE)
# After each attempt, check if last attempt reached a better result.
if (!is.null(V.best)) { # Only required if there is already a result.
if (loss < loss.best) {
loss.best <- loss
V.best <- V
# Compute actuall loss after finishing optimization.
loss <- grad(X, Y, V, h, loss.only = TRUE, persistent = TRUE)
# And the error for the history.
error <- norm(V.last %*% t(V.last) - V %*% t(V), type = "F")
V.last <- V
# Finaly write history.
loss.history[epoch, attempt] <- loss
error.history[epoch, attempt] <- error
# Check break condition.
if (error < tol) {
break()
}
} else {
}
# After each attempt, check if last attempt reached a better result.
if (loss < loss.best) {
loss.best <- loss
V.best <- V
}
}
return(list(
X = X, Y = Y, k = k,
nObs = nObs, h = h, tau = tau,
epochs = epochs, batch = batch, attempts = attempts,
loss.history = loss.history,
error.history = error.history,
loss = loss.best,
V = V.best,
B = null(V.best)
B = null(V.best),
h = h
))
}

View File

@ -17,8 +17,9 @@ cve_simple <- function(X, Y, k,
# from within `grad`.
environment(grad) <- environment()
# Setup loss histroy.
loss.history <- matrix(NA, epochs, attempts);
# Setup histories.
loss.history <- matrix(NA, epochs, attempts)
error.history <- matrix(NA, epochs, attempts)
# Get dimensions.
n <- nrow(X)
@ -35,6 +36,20 @@ cve_simple <- function(X, Y, k,
h <- estimate.bandwidth(X, k, nObs)
}
# Compute persistent data.
# Compute lookup indexes for symmetrie, lower/upper
# triangular parts and vectorization.
pair.index <- elem.pairs(seq(n))
i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
# Matrix of vectorized indices. (vec(index) -> seq)
index <- matrix(seq(n * n), n, n)
lower <- index[lower.tri(index)]
upper <- t(index)[lower]
# Create all pairewise differences of rows of `X`.
X_diff <- X[i, , drop = F] - X[j, , drop = F]
# Identity matrix.
I_p <- diag(1, p)
# Init tracking of current best (according multiple attempts).
@ -53,7 +68,7 @@ cve_simple <- function(X, Y, k,
# Initial loss and gradient.
loss <- Inf
G <- grad(X, Y, V, h, loss.out = TRUE) # `loss.out=T` sets `loss`!
G <- grad(X, Y, V, h, loss.out = TRUE, persistent = TRUE)
# Set last loss (aka, loss after applying the step).
loss.last <- loss
@ -68,7 +83,7 @@ cve_simple <- function(X, Y, k,
V.tau <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V)
# Loss at position after a step.
loss <- grad(X, Y, V.tau, h, loss.only = TRUE)
loss <- grad(X, Y, V.tau, h, loss.only = TRUE, persistent = TRUE)
# Check if step is appropriate
if ((loss - loss.last) > slack * loss.last) {
@ -78,6 +93,11 @@ cve_simple <- function(X, Y, k,
# Compute error.
error <- norm(V %*% t(V) - V.tau %*% t(V.tau), type = "F")
# Write history.
loss.history[epoch, attempt] <- loss
error.history[epoch, attempt] <- error
# Check break condition (epoch check to skip ignored gradient calc).
# Note: the devision by `sqrt(2 * k)` is included in `tol`.
if (error < tol | epoch >= epochs) {
@ -91,8 +111,7 @@ cve_simple <- function(X, Y, k,
loss.last <- loss
# Compute gradient at new position.
# Note: `loss` will be updated too!
G <- grad(X, Y, V, h, loss.out = TRUE, loss.log = TRUE)
G <- grad(X, Y, V, h, persistent = TRUE)
# Cayley transform matrix `A`
A <- (G %*% t(V)) - (V %*% t(G))
@ -108,6 +127,7 @@ cve_simple <- function(X, Y, k,
return(list(
loss.history = loss.history,
error.history = error.history,
loss = loss.best,
V = V.best,
B = null(V.best),

View File

@ -4,38 +4,47 @@
#' @param Y Responce.
#' @param V Position to compute the gradient at, aka point on Stiefl manifold.
#' @param h Bandwidth
#' @param loss.out Iff \code{TRUE} loss will be written to parent environment.
#' @param loss.only Boolean to only compute the loss, of \code{TRUE} a single
#' value loss is returned and \code{envir} is ignored.
#' @param persistent Determines if data indices and dependent calculations shall
#' be reused from the parent environment. ATTENTION: Do NOT set this flag, only
#' intended for internal usage by carefully aligned functions!
#' @keywords internal
#' @export
grad <- function(X, Y, V, h, loss.out = FALSE, loss.log = FALSE, loss.only = FALSE) {
grad <- function(X, Y, V, h,
loss.out = FALSE,
loss.only = FALSE,
persistent = FALSE) {
# Get number of samples and dimension.
n <- nrow(X)
p <- ncol(X)
# Compute lookup indexes for symmetrie, lower/upper
# triangular parts and vectorization.
pair.index <- elem.pairs(seq(n))
i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
# Matrix of vectorized indices. (vec(index) -> seq)
index <- matrix(seq(n * n), n, n)
lower <- index[lower.tri(index)]
upper <- t(index)[lower]
if (!persistent) {
# Compute lookup indexes for symmetrie, lower/upper
# triangular parts and vectorization.
pair.index <- elem.pairs(seq(n))
i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
# Matrix of vectorized indices. (vec(index) -> seq)
index <- matrix(seq(n * n), n, n)
lower <- index[lower.tri(index)]
upper <- t.default(index)[lower]
# Create all pairewise differences of rows of `X`.
X_diff <- X[i, , drop = F] - X[j, , drop = F]
# Create all pairewise differences of rows of `X`.
X_diff <- X[i, , drop = F] - X[j, , drop = F]
}
# Projection matrix onto `span(V)`
Q <- diag(1, p) - (V %*% t(V))
Q <- diag(1, p) - tcrossprod(V, V)
# Vectorized distance matrix `D`.
vecD <- rowSums((X_diff %*% Q)^2)
# Weight matrix `W` (dnorm ... gaussean density function)
W <- matrix(dnorm(0), n, n)
W[lower] <- dnorm(vecD / h) # Set lower tri. part
W[upper] <- t(W)[upper] # Mirror lower tri. to upper
W <- matrix(1, n, n) # `exp(0) == 1`
W[lower] <- exp((-0.5 / h) * vecD^2) # Set lower tri. part
W[upper] <- t.default(W)[upper] # Mirror lower tri. to upper
W <- sweep(W, 2, colSums(W), FUN = `/`) # Col-Normalize
# Weighted `Y` momentums
@ -44,19 +53,11 @@ grad <- function(X, Y, V, h, loss.out = FALSE, loss.log = FALSE, loss.only = FAL
# Per example loss `L(V, X_i)`
L <- y2 - y1^2
if (loss.out | loss.log | loss.only) {
meanL <- mean(L)
if (loss.out) {
# Bubble environments up and write to loss variable, aka out param.
loss <<- meanL
}
if (loss.log) {
loss.history[epoch, attempt] <<- meanL
}
if (loss.only) {
# Mean for total loss `L(V)`.
return(meanL)
}
if (loss.only) {
return(mean(L))
}
if (loss.out) {
loss <<- mean(L)
}
# Vectorized Weights with forced symmetry
@ -66,7 +67,7 @@ grad <- function(X, Y, V, h, loss.out = FALSE, loss.log = FALSE, loss.only = FAL
vecS <- vecS * vecD
# The gradient.
G <- t(X_diff) %*% sweep(X_diff %*% V, 1, vecS, `*`)
G <- crossprod(X_diff, sweep(X_diff %*% V, 1, vecS, `*`))
G <- (-2 / (n * h^2)) * G
return(G)
}

View File

@ -0,0 +1,16 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cve_linesearch.R
\name{cve_linesearch}
\alias{cve_linesearch}
\title{Implementation of the CVE method using curvilinear linesearch with Armijo-Wolfe
conditions.}
\usage{
cve_linesearch(X, Y, k, nObs = sqrt(nrow(X)), h = NULL, tau = 1,
tol = 0.001, rho1 = 0.1, rho2 = 0.9, slack = 0, epochs = 50L,
attempts = 10L, max.linesearch.iter = 10L)
}
\description{
Implementation of the CVE method using curvilinear linesearch with Armijo-Wolfe
conditions.
}
\keyword{internal}

View File

@ -6,7 +6,7 @@
a classic GD method unsing no further tricks.}
\usage{
cve_sgd(X, Y, k, nObs = sqrt(nrow(X)), h = NULL, tau = 0.01,
epochs = 50L, batch.size = 16L, attempts = 10L)
tol = 0.001, epochs = 50L, batch.size = 16L, attempts = 10L)
}
\description{
Simple implementation of the CVE method. 'Simple' means that this method is

View File

@ -4,8 +4,8 @@
\alias{grad}
\title{Compute get gradient of `L(V)` given a dataset `X`.}
\usage{
grad(X, Y, V, h, loss.out = FALSE, loss.log = FALSE,
loss.only = FALSE)
grad(X, Y, V, h, loss.out = FALSE, loss.only = FALSE,
persistent = FALSE)
}
\arguments{
\item{X}{Data matrix.}
@ -16,8 +16,14 @@ grad(X, Y, V, h, loss.out = FALSE, loss.log = FALSE,
\item{h}{Bandwidth}
\item{loss.out}{Iff \code{TRUE} loss will be written to parent environment.}
\item{loss.only}{Boolean to only compute the loss, of \code{TRUE} a single
value loss is returned and \code{envir} is ignored.}
\item{persistent}{Determines if data indices and dependent calculations shall
be reused from the parent environment. ATTENTION: Do NOT set this flag, only
intended for internal usage by carefully aligned functions!}
}
\description{
Compute get gradient of `L(V)` given a dataset `X`.

View File

@ -111,3 +111,89 @@ do.call(f, list(quote(A)), envir = env) # f.Global A.new
do.call("f", list(as.name("A")), envir = env) # f.new A.new
do.call("f", list(as.name("A")), envir = env) # f.new A.new
```
# Performance benchmarks
In this section alternative implementations of simple algorithms are compared for there performance.
### Computing the trace of a matrix multiplication.
```R
library(microbenchmark)
A <- matrix(runif(120), 12, 10)
# Matrix trace.
tr <- function(M) sum(diag(M))
# Check correctnes and benckmark performance.
stopifnot(
all.equal(
tr(t(A) %*% A),
sum(diag(t(A) %*% A)),
sum(A * A)
)
)
microbenchmark(
tr(t(A) %*% A),
sum(diag(t(A) %*% A)),
sum(A * A)
)
# Unit: nanoseconds
# expr min lq mean median uq max neval
# tr(t(A) %*% A) 4335 4713 5076.36 4949.5 5402.5 7928 100
# sum(diag(t(A) %*% A)) 4106 4429 5233.89 4733.5 5057.5 49308 100
# sum(A * A) 540 681 777.07 740.0 818.5 3572 100
```
```R
n <- 200
M <- matrix(runif(n^2), n, n)
dnorm2 <- function(x) exp(-0.5 * x^2) / sqrt(2 * pi)
stopifnot(
all.equal(dnorm(M), dnorm2(M))
)
microbenchmark(
dnorm = dnorm(M),
dnorm2 = dnorm2(M),
exp = exp(-0.5 * M^2) # without scaling -> irrelevant for usage
)
# Unit: microseconds
# expr min lq mean median uq max neval
# dnorm 841.503 843.811 920.7828 855.7505 912.4720 2405.587 100
# dnorm2 543.510 580.319 629.5321 597.8540 607.3795 2603.763 100
# exp 502.083 535.943 577.2884 548.3745 561.3280 2113.220 100
```
### Using `crosspord()`
```R
p <- 12
q <- 10
V <- matrix(runif(p * q), p, q)
stopifnot(
all.equal(V %*% t(V), tcrossprod(V)),
all.equal(V %*% t(V), tcrossprod(V, V))
)
microbenchmark(
V %*% t(V),
tcrossprod(V),
tcrossprod(V, V)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# V %*% t(V) 2.293 2.6335 2.94673 2.7375 2.9060 19.592 100
# tcrossprod(V) 1.148 1.2475 1.86173 1.3440 1.4650 30.688 100
# tcrossprod(V, V) 1.003 1.1575 1.28451 1.2400 1.3685 2.742 100
```
## Using `Rprof()` for performance.
The standart method for profiling where an algorithm is spending its time is with `Rprof()`.
```R
path <- '../tmp/R.prof' # path to profiling file
Rprof(path)
cve.res <- cve.call(X, Y, k = k)
Rprof(NULL)
(prof <- summaryRprof(path)) # Summarise results
```
**Note: considure to run `gc()` before measuring**, aka cleaning up by explicitely calling the garbage collector.

View File

@ -7,9 +7,11 @@ tell.user <- function(name, start.time, i, length) {
i, "/", length,
" - elapsed:", format(Sys.time() - start.time), "\033[K")
}
library(CVE) # load CVE
source("CVE_legacy/function_script.R") # Source legacy code
subspace.dist <- function(B1, B2){
P1 <- B1 %*% solve(t(B1) %*% B1) %*% t(B1)
P2 <- B2 %*% solve(t(B2) %*% B2) %*% t(B2)
return(norm(P1 - P2, type = 'F'))
}
# Number of simulations
SIM.NR <- 50
@ -20,8 +22,13 @@ ATTEMPTS <- 10
# set names of datasets
dataset.names <- c("M1", "M2", "M3", "M4", "M5")
# Set used CVE method
# methods <- c("legacy", "simple", "sgd")
methods <- c("legacy", "simple", "sgd")
methods <- c("simple") #, "sgd") # "legacy"
library(CVE) # load CVE
if ("legacy" %in% methods) {
# Source legacy code (but only if needed)
source("CVE_legacy/function_script.R")
}
# Setup error and time tracking variables
error <- matrix(NA, SIM.NR, length(methods) * length(dataset.names))
@ -79,7 +86,7 @@ for (sim in 1:SIM.NR) {
}
key <- paste0(name, '-', method)
error[sim, key] <- subspace_dist(dr$B, ds$B) / sqrt(2 * truedim)
error[sim, key] <- subspace.dist(dr$B, ds$B) / sqrt(2 * truedim)
time[sim, key] <- dr.time["elapsed"]
# Log results to file (mostly for long running simulations)