add: cve_linesearch,
fix: cleaned and optimized gradient, add. notes
This commit is contained in:
parent
1c120ec67c
commit
7d4d01a9a7
|
@ -6,6 +6,7 @@ export(col.pair.apply)
|
||||||
export(cve)
|
export(cve)
|
||||||
export(cve.call)
|
export(cve.call)
|
||||||
export(cve.grid.search)
|
export(cve.grid.search)
|
||||||
|
export(cve_linesearch)
|
||||||
export(cve_sgd)
|
export(cve_sgd)
|
||||||
export(cve_simple)
|
export(cve_simple)
|
||||||
export(dataset)
|
export(dataset)
|
||||||
|
|
|
@ -94,6 +94,8 @@ cve.call <- function(X, Y, method = "simple", nObs = nrow(X)^.5, k, ...) {
|
||||||
method <- tolower(method)
|
method <- tolower(method)
|
||||||
if (method == 'simple') {
|
if (method == 'simple') {
|
||||||
dr <- cve_simple(X, Y, k, nObs = nObs, ...)
|
dr <- cve_simple(X, Y, k, nObs = nObs, ...)
|
||||||
|
} else if (method == 'linesearch') {
|
||||||
|
dr <- cve_linesearch(X, Y, k, nObs = nObs, ...)
|
||||||
} else if (method == 'sgd') {
|
} else if (method == 'sgd') {
|
||||||
dr <- cve_sgd(X, Y, k, nObs = nObs, ...)
|
dr <- cve_sgd(X, Y, k, nObs = nObs, ...)
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -0,0 +1,163 @@
|
||||||
|
#' Implementation of the CVE method using curvilinear linesearch with Armijo-Wolfe
|
||||||
|
#' conditions.
|
||||||
|
#'
|
||||||
|
#' @keywords internal
|
||||||
|
#' @export
|
||||||
|
cve_linesearch <- function(X, Y, k,
|
||||||
|
nObs = sqrt(nrow(X)),
|
||||||
|
h = NULL,
|
||||||
|
tau = 1.0,
|
||||||
|
tol = 1e-3,
|
||||||
|
rho1 = 0.1,
|
||||||
|
rho2 = 0.9,
|
||||||
|
slack = 0,
|
||||||
|
epochs = 50L,
|
||||||
|
attempts = 10L,
|
||||||
|
max.linesearch.iter = 10L
|
||||||
|
) {
|
||||||
|
# Set `grad` functions environment to enable if to find this environments
|
||||||
|
# local variabels, needed to enable the manipulation of this local variables
|
||||||
|
# from within `grad`.
|
||||||
|
environment(grad) <- environment()
|
||||||
|
|
||||||
|
# Setup histories.
|
||||||
|
loss.history <- matrix(NA, epochs, attempts)
|
||||||
|
error.history <- matrix(NA, epochs, attempts)
|
||||||
|
tau.history <- matrix(NA, epochs, attempts)
|
||||||
|
|
||||||
|
# Get dimensions.
|
||||||
|
n <- nrow(X)
|
||||||
|
p <- ncol(X)
|
||||||
|
q <- p - k
|
||||||
|
|
||||||
|
# Save initial learning rate `tau`.
|
||||||
|
tau.init <- tau
|
||||||
|
# Addapt tolearance for break condition.
|
||||||
|
tol <- sqrt(2 * q) * tol
|
||||||
|
|
||||||
|
# Estaimate bandwidth if not given.
|
||||||
|
if (missing(h) | !is.numeric(h)) {
|
||||||
|
h <- estimate.bandwidth(X, k, nObs)
|
||||||
|
}
|
||||||
|
|
||||||
|
# Compute persistent data.
|
||||||
|
# Compute lookup indexes for symmetrie, lower/upper
|
||||||
|
# triangular parts and vectorization.
|
||||||
|
pair.index <- elem.pairs(seq(n))
|
||||||
|
i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
|
||||||
|
j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
|
||||||
|
# Matrix of vectorized indices. (vec(index) -> seq)
|
||||||
|
index <- matrix(seq(n * n), n, n)
|
||||||
|
lower <- index[lower.tri(index)]
|
||||||
|
upper <- t(index)[lower]
|
||||||
|
|
||||||
|
# Create all pairewise differences of rows of `X`.
|
||||||
|
X_diff <- X[i, , drop = F] - X[j, , drop = F]
|
||||||
|
# Identity matrix.
|
||||||
|
I_p <- diag(1, p)
|
||||||
|
|
||||||
|
# Init tracking of current best (according multiple attempts).
|
||||||
|
V.best <- NULL
|
||||||
|
loss.best <- Inf
|
||||||
|
|
||||||
|
# Start loop for multiple attempts.
|
||||||
|
for (attempt in 1:attempts) {
|
||||||
|
|
||||||
|
# Sample a `(p, q)` dimensional matrix from the stiefel manifold as
|
||||||
|
# optimization start value.
|
||||||
|
V <- rStiefl(p, q)
|
||||||
|
|
||||||
|
# Initial loss and gradient.
|
||||||
|
loss <- Inf
|
||||||
|
G <- grad(X, Y, V, h, loss.out = TRUE, persistent = TRUE)
|
||||||
|
# Set last loss (aka, loss after applying the step).
|
||||||
|
loss.last <- loss
|
||||||
|
|
||||||
|
## Start optimization loop.
|
||||||
|
for (epoch in 1:epochs) {
|
||||||
|
|
||||||
|
# Cayley transform matrix `A`
|
||||||
|
A <- (G %*% t(V)) - (V %*% t(G))
|
||||||
|
|
||||||
|
# Directional derivative of the loss at current position, given
|
||||||
|
# as `Tr(G^T \cdot A \cdot V)`.
|
||||||
|
loss.prime <- -0.5 * norm(A, type = 'F')^2
|
||||||
|
|
||||||
|
# Linesearch
|
||||||
|
tau.upper <- Inf
|
||||||
|
tau.lower <- 0
|
||||||
|
tau <- tau.init
|
||||||
|
for (iter in 1:max.linesearch.iter) {
|
||||||
|
# Apply learning rate `tau`.
|
||||||
|
A.tau <- (tau / 2) * A
|
||||||
|
# Parallet transport (on Stiefl manifold) into direction of `G`.
|
||||||
|
inv <- solve(I_p + A.tau)
|
||||||
|
V.tau <- inv %*% ((I_p - A.tau) %*% V)
|
||||||
|
|
||||||
|
# Loss at position after a step.
|
||||||
|
loss <- Inf # aka loss.tau
|
||||||
|
G.tau <- grad(X, Y, V.tau, h, loss.out = TRUE, persistent = TRUE)
|
||||||
|
|
||||||
|
# Armijo condition.
|
||||||
|
if (loss > loss.last + (rho1 * tau * loss.prime)) {
|
||||||
|
tau.upper <- tau
|
||||||
|
tau <- (tau.lower + tau.upper) / 2
|
||||||
|
next()
|
||||||
|
}
|
||||||
|
|
||||||
|
V.prime.tau <- -0.5 * inv %*% A %*% (V + V.tau)
|
||||||
|
loss.prime.tau <- sum(G * V.prime.tau) # Tr(grad(tau)^T \cdot Y^'(tau))
|
||||||
|
|
||||||
|
# Wolfe condition.
|
||||||
|
if (loss.prime.tau < rho2 * loss.prime) {
|
||||||
|
tau.lower <- tau
|
||||||
|
if (tau.upper == Inf) {
|
||||||
|
tau <- 2 * tau.lower
|
||||||
|
} else {
|
||||||
|
tau <- (tau.lower + tau.upper) / 2
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
break()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Compute error.
|
||||||
|
error <- norm(V %*% t(V) - V.tau %*% t(V.tau), type = "F")
|
||||||
|
|
||||||
|
# Write history.
|
||||||
|
loss.history[epoch, attempt] <- loss
|
||||||
|
error.history[epoch, attempt] <- error
|
||||||
|
tau.history[epoch, attempt] <- tau
|
||||||
|
|
||||||
|
# Check break condition (epoch check to skip ignored gradient calc).
|
||||||
|
# Note: the devision by `sqrt(2 * k)` is included in `tol`.
|
||||||
|
if (error < tol | epoch >= epochs) {
|
||||||
|
# take last step and stop optimization.
|
||||||
|
V <- V.tau
|
||||||
|
break()
|
||||||
|
}
|
||||||
|
|
||||||
|
# Perform the step and remember previous loss.
|
||||||
|
V <- V.tau
|
||||||
|
loss.last <- loss
|
||||||
|
G <- G.tau
|
||||||
|
}
|
||||||
|
|
||||||
|
# Check if current attempt improved previous ones
|
||||||
|
if (loss < loss.best) {
|
||||||
|
loss.best <- loss
|
||||||
|
V.best <- V
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return(list(
|
||||||
|
loss.history = loss.history,
|
||||||
|
error.history = error.history,
|
||||||
|
tau.history = tau.history,
|
||||||
|
loss = loss.best,
|
||||||
|
V = V.best,
|
||||||
|
B = null(V.best),
|
||||||
|
h = h
|
||||||
|
))
|
||||||
|
}
|
|
@ -7,10 +7,20 @@ cve_sgd <- function(X, Y, k,
|
||||||
nObs = sqrt(nrow(X)),
|
nObs = sqrt(nrow(X)),
|
||||||
h = NULL,
|
h = NULL,
|
||||||
tau = 0.01,
|
tau = 0.01,
|
||||||
|
tol = 1e-3,
|
||||||
epochs = 50L,
|
epochs = 50L,
|
||||||
batch.size = 16L,
|
batch.size = 16L,
|
||||||
attempts = 10L
|
attempts = 10L
|
||||||
) {
|
) {
|
||||||
|
# Set `grad` functions environment to enable if to find this environments
|
||||||
|
# local variabels, needed to enable the manipulation of this local variables
|
||||||
|
# from within `grad`.
|
||||||
|
environment(grad) <- environment()
|
||||||
|
|
||||||
|
# Setup histories.
|
||||||
|
loss.history <- matrix(NA, epochs, attempts)
|
||||||
|
error.history <- matrix(NA, epochs, attempts)
|
||||||
|
|
||||||
# Get dimensions.
|
# Get dimensions.
|
||||||
n <- nrow(X) # Number of samples.
|
n <- nrow(X) # Number of samples.
|
||||||
p <- ncol(X) # Data dimensions
|
p <- ncol(X) # Data dimensions
|
||||||
|
@ -18,15 +28,31 @@ cve_sgd <- function(X, Y, k,
|
||||||
|
|
||||||
# Save initial learning rate `tau`.
|
# Save initial learning rate `tau`.
|
||||||
tau.init <- tau
|
tau.init <- tau
|
||||||
|
# Addapt tolearance for break condition.
|
||||||
|
tol <- sqrt(2 * q) * tol
|
||||||
|
|
||||||
# Estaimate bandwidth if not given.
|
# Estaimate bandwidth if not given.
|
||||||
if (missing(h) | !is.numeric(h)) {
|
if (missing(h) | !is.numeric(h)) {
|
||||||
h <- estimate.bandwidth(X, k, nObs)
|
h <- estimate.bandwidth(X, k, nObs)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Compute persistent data.
|
||||||
|
# Compute lookup indexes for symmetrie, lower/upper
|
||||||
|
# triangular parts and vectorization.
|
||||||
|
pair.index <- elem.pairs(seq(n))
|
||||||
|
i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
|
||||||
|
j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
|
||||||
|
# Matrix of vectorized indices. (vec(index) -> seq)
|
||||||
|
index <- matrix(seq(n * n), n, n)
|
||||||
|
lower <- index[lower.tri(index)]
|
||||||
|
upper <- t(index)[lower]
|
||||||
|
|
||||||
|
# Create all pairewise differences of rows of `X`.
|
||||||
|
X_diff <- X[i, , drop = F] - X[j, , drop = F]
|
||||||
|
# Identity matrix.
|
||||||
|
I_p <- diag(1, p)
|
||||||
# Init a list of data indices (shuffled for batching).
|
# Init a list of data indices (shuffled for batching).
|
||||||
indices <- seq(n)
|
indices <- seq(n)
|
||||||
I_p <- diag(1, p)
|
|
||||||
|
|
||||||
# Init tracking of current best (according multiple attempts).
|
# Init tracking of current best (according multiple attempts).
|
||||||
V.best <- NULL
|
V.best <- NULL
|
||||||
|
@ -40,6 +66,8 @@ cve_sgd <- function(X, Y, k,
|
||||||
# Sample a `(p, q)` dimensional matrix from the stiefel manifold as
|
# Sample a `(p, q)` dimensional matrix from the stiefel manifold as
|
||||||
# optimization start value.
|
# optimization start value.
|
||||||
V <- rStiefl(p, q)
|
V <- rStiefl(p, q)
|
||||||
|
# Keep track of last `V` for computing error after an epoch.
|
||||||
|
V.last <- V
|
||||||
|
|
||||||
# Repeat `epochs` times
|
# Repeat `epochs` times
|
||||||
for (epoch in 1:epochs) {
|
for (epoch in 1:epochs) {
|
||||||
|
@ -55,7 +83,7 @@ cve_sgd <- function(X, Y, k,
|
||||||
|
|
||||||
# Compute batch gradient.
|
# Compute batch gradient.
|
||||||
loss <- NULL
|
loss <- NULL
|
||||||
G <- grad(X[batch, ], Y[batch], V, h)
|
G <- grad(X[batch, ], Y[batch], V, h, loss.out = TRUE)
|
||||||
|
|
||||||
# Cayley transform matrix.
|
# Cayley transform matrix.
|
||||||
A <- (G %*% t(V)) - (V %*% t(G))
|
A <- (G %*% t(V)) - (V %*% t(G))
|
||||||
|
@ -65,27 +93,34 @@ cve_sgd <- function(X, Y, k,
|
||||||
# Parallet transport (on Stiefl manifold) into direction of `G`.
|
# Parallet transport (on Stiefl manifold) into direction of `G`.
|
||||||
V <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V)
|
V <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V)
|
||||||
}
|
}
|
||||||
}
|
# Compute actuall loss after finishing optimization.
|
||||||
# Compute actuall loss after finishing optimization.
|
loss <- grad(X, Y, V, h, loss.only = TRUE, persistent = TRUE)
|
||||||
loss <- grad(X, Y, V, h, loss.only = TRUE)
|
# And the error for the history.
|
||||||
# After each attempt, check if last attempt reached a better result.
|
error <- norm(V.last %*% t(V.last) - V %*% t(V), type = "F")
|
||||||
if (!is.null(V.best)) { # Only required if there is already a result.
|
V.last <- V
|
||||||
if (loss < loss.best) {
|
|
||||||
loss.best <- loss
|
# Finaly write history.
|
||||||
V.best <- V
|
loss.history[epoch, attempt] <- loss
|
||||||
|
error.history[epoch, attempt] <- error
|
||||||
|
|
||||||
|
# Check break condition.
|
||||||
|
if (error < tol) {
|
||||||
|
break()
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
|
# After each attempt, check if last attempt reached a better result.
|
||||||
|
if (loss < loss.best) {
|
||||||
loss.best <- loss
|
loss.best <- loss
|
||||||
V.best <- V
|
V.best <- V
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return(list(
|
return(list(
|
||||||
X = X, Y = Y, k = k,
|
loss.history = loss.history,
|
||||||
nObs = nObs, h = h, tau = tau,
|
error.history = error.history,
|
||||||
epochs = epochs, batch = batch, attempts = attempts,
|
|
||||||
loss = loss.best,
|
loss = loss.best,
|
||||||
V = V.best,
|
V = V.best,
|
||||||
B = null(V.best)
|
B = null(V.best),
|
||||||
|
h = h
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,8 +17,9 @@ cve_simple <- function(X, Y, k,
|
||||||
# from within `grad`.
|
# from within `grad`.
|
||||||
environment(grad) <- environment()
|
environment(grad) <- environment()
|
||||||
|
|
||||||
# Setup loss histroy.
|
# Setup histories.
|
||||||
loss.history <- matrix(NA, epochs, attempts);
|
loss.history <- matrix(NA, epochs, attempts)
|
||||||
|
error.history <- matrix(NA, epochs, attempts)
|
||||||
|
|
||||||
# Get dimensions.
|
# Get dimensions.
|
||||||
n <- nrow(X)
|
n <- nrow(X)
|
||||||
|
@ -35,6 +36,20 @@ cve_simple <- function(X, Y, k,
|
||||||
h <- estimate.bandwidth(X, k, nObs)
|
h <- estimate.bandwidth(X, k, nObs)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Compute persistent data.
|
||||||
|
# Compute lookup indexes for symmetrie, lower/upper
|
||||||
|
# triangular parts and vectorization.
|
||||||
|
pair.index <- elem.pairs(seq(n))
|
||||||
|
i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
|
||||||
|
j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
|
||||||
|
# Matrix of vectorized indices. (vec(index) -> seq)
|
||||||
|
index <- matrix(seq(n * n), n, n)
|
||||||
|
lower <- index[lower.tri(index)]
|
||||||
|
upper <- t(index)[lower]
|
||||||
|
|
||||||
|
# Create all pairewise differences of rows of `X`.
|
||||||
|
X_diff <- X[i, , drop = F] - X[j, , drop = F]
|
||||||
|
# Identity matrix.
|
||||||
I_p <- diag(1, p)
|
I_p <- diag(1, p)
|
||||||
|
|
||||||
# Init tracking of current best (according multiple attempts).
|
# Init tracking of current best (according multiple attempts).
|
||||||
|
@ -53,7 +68,7 @@ cve_simple <- function(X, Y, k,
|
||||||
|
|
||||||
# Initial loss and gradient.
|
# Initial loss and gradient.
|
||||||
loss <- Inf
|
loss <- Inf
|
||||||
G <- grad(X, Y, V, h, loss.out = TRUE) # `loss.out=T` sets `loss`!
|
G <- grad(X, Y, V, h, loss.out = TRUE, persistent = TRUE)
|
||||||
# Set last loss (aka, loss after applying the step).
|
# Set last loss (aka, loss after applying the step).
|
||||||
loss.last <- loss
|
loss.last <- loss
|
||||||
|
|
||||||
|
@ -68,7 +83,7 @@ cve_simple <- function(X, Y, k,
|
||||||
V.tau <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V)
|
V.tau <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V)
|
||||||
|
|
||||||
# Loss at position after a step.
|
# Loss at position after a step.
|
||||||
loss <- grad(X, Y, V.tau, h, loss.only = TRUE)
|
loss <- grad(X, Y, V.tau, h, loss.only = TRUE, persistent = TRUE)
|
||||||
|
|
||||||
# Check if step is appropriate
|
# Check if step is appropriate
|
||||||
if ((loss - loss.last) > slack * loss.last) {
|
if ((loss - loss.last) > slack * loss.last) {
|
||||||
|
@ -78,6 +93,11 @@ cve_simple <- function(X, Y, k,
|
||||||
|
|
||||||
# Compute error.
|
# Compute error.
|
||||||
error <- norm(V %*% t(V) - V.tau %*% t(V.tau), type = "F")
|
error <- norm(V %*% t(V) - V.tau %*% t(V.tau), type = "F")
|
||||||
|
|
||||||
|
# Write history.
|
||||||
|
loss.history[epoch, attempt] <- loss
|
||||||
|
error.history[epoch, attempt] <- error
|
||||||
|
|
||||||
# Check break condition (epoch check to skip ignored gradient calc).
|
# Check break condition (epoch check to skip ignored gradient calc).
|
||||||
# Note: the devision by `sqrt(2 * k)` is included in `tol`.
|
# Note: the devision by `sqrt(2 * k)` is included in `tol`.
|
||||||
if (error < tol | epoch >= epochs) {
|
if (error < tol | epoch >= epochs) {
|
||||||
|
@ -91,8 +111,7 @@ cve_simple <- function(X, Y, k,
|
||||||
loss.last <- loss
|
loss.last <- loss
|
||||||
|
|
||||||
# Compute gradient at new position.
|
# Compute gradient at new position.
|
||||||
# Note: `loss` will be updated too!
|
G <- grad(X, Y, V, h, persistent = TRUE)
|
||||||
G <- grad(X, Y, V, h, loss.out = TRUE, loss.log = TRUE)
|
|
||||||
|
|
||||||
# Cayley transform matrix `A`
|
# Cayley transform matrix `A`
|
||||||
A <- (G %*% t(V)) - (V %*% t(G))
|
A <- (G %*% t(V)) - (V %*% t(G))
|
||||||
|
@ -108,6 +127,7 @@ cve_simple <- function(X, Y, k,
|
||||||
|
|
||||||
return(list(
|
return(list(
|
||||||
loss.history = loss.history,
|
loss.history = loss.history,
|
||||||
|
error.history = error.history,
|
||||||
loss = loss.best,
|
loss = loss.best,
|
||||||
V = V.best,
|
V = V.best,
|
||||||
B = null(V.best),
|
B = null(V.best),
|
||||||
|
|
|
@ -4,38 +4,47 @@
|
||||||
#' @param Y Responce.
|
#' @param Y Responce.
|
||||||
#' @param V Position to compute the gradient at, aka point on Stiefl manifold.
|
#' @param V Position to compute the gradient at, aka point on Stiefl manifold.
|
||||||
#' @param h Bandwidth
|
#' @param h Bandwidth
|
||||||
|
#' @param loss.out Iff \code{TRUE} loss will be written to parent environment.
|
||||||
#' @param loss.only Boolean to only compute the loss, of \code{TRUE} a single
|
#' @param loss.only Boolean to only compute the loss, of \code{TRUE} a single
|
||||||
#' value loss is returned and \code{envir} is ignored.
|
#' value loss is returned and \code{envir} is ignored.
|
||||||
|
#' @param persistent Determines if data indices and dependent calculations shall
|
||||||
|
#' be reused from the parent environment. ATTENTION: Do NOT set this flag, only
|
||||||
|
#' intended for internal usage by carefully aligned functions!
|
||||||
#' @keywords internal
|
#' @keywords internal
|
||||||
#' @export
|
#' @export
|
||||||
grad <- function(X, Y, V, h, loss.out = FALSE, loss.log = FALSE, loss.only = FALSE) {
|
grad <- function(X, Y, V, h,
|
||||||
|
loss.out = FALSE,
|
||||||
|
loss.only = FALSE,
|
||||||
|
persistent = FALSE) {
|
||||||
# Get number of samples and dimension.
|
# Get number of samples and dimension.
|
||||||
n <- nrow(X)
|
n <- nrow(X)
|
||||||
p <- ncol(X)
|
p <- ncol(X)
|
||||||
|
|
||||||
# Compute lookup indexes for symmetrie, lower/upper
|
if (!persistent) {
|
||||||
# triangular parts and vectorization.
|
# Compute lookup indexes for symmetrie, lower/upper
|
||||||
pair.index <- elem.pairs(seq(n))
|
# triangular parts and vectorization.
|
||||||
i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
|
pair.index <- elem.pairs(seq(n))
|
||||||
j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
|
i <- pair.index[, 1] # `i` indices of `(i, j)` pairs
|
||||||
# Matrix of vectorized indices. (vec(index) -> seq)
|
j <- pair.index[, 2] # `j` indices of `(i, j)` pairs
|
||||||
index <- matrix(seq(n * n), n, n)
|
# Matrix of vectorized indices. (vec(index) -> seq)
|
||||||
lower <- index[lower.tri(index)]
|
index <- matrix(seq(n * n), n, n)
|
||||||
upper <- t(index)[lower]
|
lower <- index[lower.tri(index)]
|
||||||
|
upper <- t.default(index)[lower]
|
||||||
|
|
||||||
# Create all pairewise differences of rows of `X`.
|
# Create all pairewise differences of rows of `X`.
|
||||||
X_diff <- X[i, , drop = F] - X[j, , drop = F]
|
X_diff <- X[i, , drop = F] - X[j, , drop = F]
|
||||||
|
}
|
||||||
|
|
||||||
# Projection matrix onto `span(V)`
|
# Projection matrix onto `span(V)`
|
||||||
Q <- diag(1, p) - (V %*% t(V))
|
Q <- diag(1, p) - tcrossprod(V, V)
|
||||||
|
|
||||||
# Vectorized distance matrix `D`.
|
# Vectorized distance matrix `D`.
|
||||||
vecD <- rowSums((X_diff %*% Q)^2)
|
vecD <- rowSums((X_diff %*% Q)^2)
|
||||||
|
|
||||||
# Weight matrix `W` (dnorm ... gaussean density function)
|
# Weight matrix `W` (dnorm ... gaussean density function)
|
||||||
W <- matrix(dnorm(0), n, n)
|
W <- matrix(1, n, n) # `exp(0) == 1`
|
||||||
W[lower] <- dnorm(vecD / h) # Set lower tri. part
|
W[lower] <- exp((-0.5 / h) * vecD^2) # Set lower tri. part
|
||||||
W[upper] <- t(W)[upper] # Mirror lower tri. to upper
|
W[upper] <- t.default(W)[upper] # Mirror lower tri. to upper
|
||||||
W <- sweep(W, 2, colSums(W), FUN = `/`) # Col-Normalize
|
W <- sweep(W, 2, colSums(W), FUN = `/`) # Col-Normalize
|
||||||
|
|
||||||
# Weighted `Y` momentums
|
# Weighted `Y` momentums
|
||||||
|
@ -44,19 +53,11 @@ grad <- function(X, Y, V, h, loss.out = FALSE, loss.log = FALSE, loss.only = FAL
|
||||||
|
|
||||||
# Per example loss `L(V, X_i)`
|
# Per example loss `L(V, X_i)`
|
||||||
L <- y2 - y1^2
|
L <- y2 - y1^2
|
||||||
if (loss.out | loss.log | loss.only) {
|
if (loss.only) {
|
||||||
meanL <- mean(L)
|
return(mean(L))
|
||||||
if (loss.out) {
|
}
|
||||||
# Bubble environments up and write to loss variable, aka out param.
|
if (loss.out) {
|
||||||
loss <<- meanL
|
loss <<- mean(L)
|
||||||
}
|
|
||||||
if (loss.log) {
|
|
||||||
loss.history[epoch, attempt] <<- meanL
|
|
||||||
}
|
|
||||||
if (loss.only) {
|
|
||||||
# Mean for total loss `L(V)`.
|
|
||||||
return(meanL)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Vectorized Weights with forced symmetry
|
# Vectorized Weights with forced symmetry
|
||||||
|
@ -66,7 +67,7 @@ grad <- function(X, Y, V, h, loss.out = FALSE, loss.log = FALSE, loss.only = FAL
|
||||||
vecS <- vecS * vecD
|
vecS <- vecS * vecD
|
||||||
|
|
||||||
# The gradient.
|
# The gradient.
|
||||||
G <- t(X_diff) %*% sweep(X_diff %*% V, 1, vecS, `*`)
|
G <- crossprod(X_diff, sweep(X_diff %*% V, 1, vecS, `*`))
|
||||||
G <- (-2 / (n * h^2)) * G
|
G <- (-2 / (n * h^2)) * G
|
||||||
return(G)
|
return(G)
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/cve_linesearch.R
|
||||||
|
\name{cve_linesearch}
|
||||||
|
\alias{cve_linesearch}
|
||||||
|
\title{Implementation of the CVE method using curvilinear linesearch with Armijo-Wolfe
|
||||||
|
conditions.}
|
||||||
|
\usage{
|
||||||
|
cve_linesearch(X, Y, k, nObs = sqrt(nrow(X)), h = NULL, tau = 1,
|
||||||
|
tol = 0.001, rho1 = 0.1, rho2 = 0.9, slack = 0, epochs = 50L,
|
||||||
|
attempts = 10L, max.linesearch.iter = 10L)
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
Implementation of the CVE method using curvilinear linesearch with Armijo-Wolfe
|
||||||
|
conditions.
|
||||||
|
}
|
||||||
|
\keyword{internal}
|
|
@ -6,7 +6,7 @@
|
||||||
a classic GD method unsing no further tricks.}
|
a classic GD method unsing no further tricks.}
|
||||||
\usage{
|
\usage{
|
||||||
cve_sgd(X, Y, k, nObs = sqrt(nrow(X)), h = NULL, tau = 0.01,
|
cve_sgd(X, Y, k, nObs = sqrt(nrow(X)), h = NULL, tau = 0.01,
|
||||||
epochs = 50L, batch.size = 16L, attempts = 10L)
|
tol = 0.001, epochs = 50L, batch.size = 16L, attempts = 10L)
|
||||||
}
|
}
|
||||||
\description{
|
\description{
|
||||||
Simple implementation of the CVE method. 'Simple' means that this method is
|
Simple implementation of the CVE method. 'Simple' means that this method is
|
||||||
|
|
|
@ -4,8 +4,8 @@
|
||||||
\alias{grad}
|
\alias{grad}
|
||||||
\title{Compute get gradient of `L(V)` given a dataset `X`.}
|
\title{Compute get gradient of `L(V)` given a dataset `X`.}
|
||||||
\usage{
|
\usage{
|
||||||
grad(X, Y, V, h, loss.out = FALSE, loss.log = FALSE,
|
grad(X, Y, V, h, loss.out = FALSE, loss.only = FALSE,
|
||||||
loss.only = FALSE)
|
persistent = FALSE)
|
||||||
}
|
}
|
||||||
\arguments{
|
\arguments{
|
||||||
\item{X}{Data matrix.}
|
\item{X}{Data matrix.}
|
||||||
|
@ -16,8 +16,14 @@ grad(X, Y, V, h, loss.out = FALSE, loss.log = FALSE,
|
||||||
|
|
||||||
\item{h}{Bandwidth}
|
\item{h}{Bandwidth}
|
||||||
|
|
||||||
|
\item{loss.out}{Iff \code{TRUE} loss will be written to parent environment.}
|
||||||
|
|
||||||
\item{loss.only}{Boolean to only compute the loss, of \code{TRUE} a single
|
\item{loss.only}{Boolean to only compute the loss, of \code{TRUE} a single
|
||||||
value loss is returned and \code{envir} is ignored.}
|
value loss is returned and \code{envir} is ignored.}
|
||||||
|
|
||||||
|
\item{persistent}{Determines if data indices and dependent calculations shall
|
||||||
|
be reused from the parent environment. ATTENTION: Do NOT set this flag, only
|
||||||
|
intended for internal usage by carefully aligned functions!}
|
||||||
}
|
}
|
||||||
\description{
|
\description{
|
||||||
Compute get gradient of `L(V)` given a dataset `X`.
|
Compute get gradient of `L(V)` given a dataset `X`.
|
||||||
|
|
86
notes.md
86
notes.md
|
@ -111,3 +111,89 @@ do.call(f, list(quote(A)), envir = env) # f.Global A.new
|
||||||
do.call("f", list(as.name("A")), envir = env) # f.new A.new
|
do.call("f", list(as.name("A")), envir = env) # f.new A.new
|
||||||
do.call("f", list(as.name("A")), envir = env) # f.new A.new
|
do.call("f", list(as.name("A")), envir = env) # f.new A.new
|
||||||
```
|
```
|
||||||
|
|
||||||
|
# Performance benchmarks
|
||||||
|
In this section alternative implementations of simple algorithms are compared for there performance.
|
||||||
|
|
||||||
|
### Computing the trace of a matrix multiplication.
|
||||||
|
```R
|
||||||
|
library(microbenchmark)
|
||||||
|
|
||||||
|
A <- matrix(runif(120), 12, 10)
|
||||||
|
|
||||||
|
# Matrix trace.
|
||||||
|
tr <- function(M) sum(diag(M))
|
||||||
|
|
||||||
|
# Check correctnes and benckmark performance.
|
||||||
|
stopifnot(
|
||||||
|
all.equal(
|
||||||
|
tr(t(A) %*% A),
|
||||||
|
sum(diag(t(A) %*% A)),
|
||||||
|
sum(A * A)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
microbenchmark(
|
||||||
|
tr(t(A) %*% A),
|
||||||
|
sum(diag(t(A) %*% A)),
|
||||||
|
sum(A * A)
|
||||||
|
)
|
||||||
|
# Unit: nanoseconds
|
||||||
|
# expr min lq mean median uq max neval
|
||||||
|
# tr(t(A) %*% A) 4335 4713 5076.36 4949.5 5402.5 7928 100
|
||||||
|
# sum(diag(t(A) %*% A)) 4106 4429 5233.89 4733.5 5057.5 49308 100
|
||||||
|
# sum(A * A) 540 681 777.07 740.0 818.5 3572 100
|
||||||
|
```
|
||||||
|
|
||||||
|
```R
|
||||||
|
n <- 200
|
||||||
|
M <- matrix(runif(n^2), n, n)
|
||||||
|
|
||||||
|
dnorm2 <- function(x) exp(-0.5 * x^2) / sqrt(2 * pi)
|
||||||
|
|
||||||
|
stopifnot(
|
||||||
|
all.equal(dnorm(M), dnorm2(M))
|
||||||
|
)
|
||||||
|
microbenchmark(
|
||||||
|
dnorm = dnorm(M),
|
||||||
|
dnorm2 = dnorm2(M),
|
||||||
|
exp = exp(-0.5 * M^2) # without scaling -> irrelevant for usage
|
||||||
|
)
|
||||||
|
# Unit: microseconds
|
||||||
|
# expr min lq mean median uq max neval
|
||||||
|
# dnorm 841.503 843.811 920.7828 855.7505 912.4720 2405.587 100
|
||||||
|
# dnorm2 543.510 580.319 629.5321 597.8540 607.3795 2603.763 100
|
||||||
|
# exp 502.083 535.943 577.2884 548.3745 561.3280 2113.220 100
|
||||||
|
```
|
||||||
|
|
||||||
|
### Using `crosspord()`
|
||||||
|
```R
|
||||||
|
p <- 12
|
||||||
|
q <- 10
|
||||||
|
V <- matrix(runif(p * q), p, q)
|
||||||
|
|
||||||
|
stopifnot(
|
||||||
|
all.equal(V %*% t(V), tcrossprod(V)),
|
||||||
|
all.equal(V %*% t(V), tcrossprod(V, V))
|
||||||
|
)
|
||||||
|
microbenchmark(
|
||||||
|
V %*% t(V),
|
||||||
|
tcrossprod(V),
|
||||||
|
tcrossprod(V, V)
|
||||||
|
)
|
||||||
|
# Unit: microseconds
|
||||||
|
# expr min lq mean median uq max neval
|
||||||
|
# V %*% t(V) 2.293 2.6335 2.94673 2.7375 2.9060 19.592 100
|
||||||
|
# tcrossprod(V) 1.148 1.2475 1.86173 1.3440 1.4650 30.688 100
|
||||||
|
# tcrossprod(V, V) 1.003 1.1575 1.28451 1.2400 1.3685 2.742 100
|
||||||
|
```
|
||||||
|
|
||||||
|
## Using `Rprof()` for performance.
|
||||||
|
The standart method for profiling where an algorithm is spending its time is with `Rprof()`.
|
||||||
|
```R
|
||||||
|
path <- '../tmp/R.prof' # path to profiling file
|
||||||
|
Rprof(path)
|
||||||
|
cve.res <- cve.call(X, Y, k = k)
|
||||||
|
Rprof(NULL)
|
||||||
|
(prof <- summaryRprof(path)) # Summarise results
|
||||||
|
```
|
||||||
|
**Note: considure to run `gc()` before measuring**, aka cleaning up by explicitely calling the garbage collector.
|
||||||
|
|
|
@ -7,9 +7,11 @@ tell.user <- function(name, start.time, i, length) {
|
||||||
i, "/", length,
|
i, "/", length,
|
||||||
" - elapsed:", format(Sys.time() - start.time), "\033[K")
|
" - elapsed:", format(Sys.time() - start.time), "\033[K")
|
||||||
}
|
}
|
||||||
|
subspace.dist <- function(B1, B2){
|
||||||
library(CVE) # load CVE
|
P1 <- B1 %*% solve(t(B1) %*% B1) %*% t(B1)
|
||||||
source("CVE_legacy/function_script.R") # Source legacy code
|
P2 <- B2 %*% solve(t(B2) %*% B2) %*% t(B2)
|
||||||
|
return(norm(P1 - P2, type = 'F'))
|
||||||
|
}
|
||||||
|
|
||||||
# Number of simulations
|
# Number of simulations
|
||||||
SIM.NR <- 50
|
SIM.NR <- 50
|
||||||
|
@ -20,8 +22,13 @@ ATTEMPTS <- 10
|
||||||
# set names of datasets
|
# set names of datasets
|
||||||
dataset.names <- c("M1", "M2", "M3", "M4", "M5")
|
dataset.names <- c("M1", "M2", "M3", "M4", "M5")
|
||||||
# Set used CVE method
|
# Set used CVE method
|
||||||
# methods <- c("legacy", "simple", "sgd")
|
methods <- c("simple") #, "sgd") # "legacy"
|
||||||
methods <- c("legacy", "simple", "sgd")
|
|
||||||
|
library(CVE) # load CVE
|
||||||
|
if ("legacy" %in% methods) {
|
||||||
|
# Source legacy code (but only if needed)
|
||||||
|
source("CVE_legacy/function_script.R")
|
||||||
|
}
|
||||||
|
|
||||||
# Setup error and time tracking variables
|
# Setup error and time tracking variables
|
||||||
error <- matrix(NA, SIM.NR, length(methods) * length(dataset.names))
|
error <- matrix(NA, SIM.NR, length(methods) * length(dataset.names))
|
||||||
|
@ -79,7 +86,7 @@ for (sim in 1:SIM.NR) {
|
||||||
}
|
}
|
||||||
|
|
||||||
key <- paste0(name, '-', method)
|
key <- paste0(name, '-', method)
|
||||||
error[sim, key] <- subspace_dist(dr$B, ds$B) / sqrt(2 * truedim)
|
error[sim, key] <- subspace.dist(dr$B, ds$B) / sqrt(2 * truedim)
|
||||||
time[sim, key] <- dr.time["elapsed"]
|
time[sim, key] <- dr.time["elapsed"]
|
||||||
|
|
||||||
# Log results to file (mostly for long running simulations)
|
# Log results to file (mostly for long running simulations)
|
||||||
|
|
Loading…
Reference in New Issue