add: cve_linesearch,
fix: cleaned and optimized gradient, add. notes
This commit is contained in:
		
							parent
							
								
									1c120ec67c
								
							
						
					
					
						commit
						7d4d01a9a7
					
				| @ -6,6 +6,7 @@ export(col.pair.apply) | |||||||
| export(cve) | export(cve) | ||||||
| export(cve.call) | export(cve.call) | ||||||
| export(cve.grid.search) | export(cve.grid.search) | ||||||
|  | export(cve_linesearch) | ||||||
| export(cve_sgd) | export(cve_sgd) | ||||||
| export(cve_simple) | export(cve_simple) | ||||||
| export(dataset) | export(dataset) | ||||||
|  | |||||||
| @ -94,6 +94,8 @@ cve.call <- function(X, Y, method = "simple", nObs = nrow(X)^.5, k, ...) { | |||||||
|     method <- tolower(method) |     method <- tolower(method) | ||||||
|     if (method == 'simple') { |     if (method == 'simple') { | ||||||
|         dr <- cve_simple(X, Y, k, nObs = nObs, ...) |         dr <- cve_simple(X, Y, k, nObs = nObs, ...) | ||||||
|  |     } else if (method == 'linesearch') { | ||||||
|  |         dr <- cve_linesearch(X, Y, k, nObs = nObs, ...) | ||||||
|     } else if (method == 'sgd') { |     } else if (method == 'sgd') { | ||||||
|         dr <- cve_sgd(X, Y, k, nObs = nObs, ...) |         dr <- cve_sgd(X, Y, k, nObs = nObs, ...) | ||||||
|     } else { |     } else { | ||||||
|  | |||||||
							
								
								
									
										163
									
								
								CVE_R/R/cve_linesearch.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										163
									
								
								CVE_R/R/cve_linesearch.R
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,163 @@ | |||||||
|  | #' Implementation of the CVE method using curvilinear linesearch with Armijo-Wolfe | ||||||
|  | #' conditions. | ||||||
|  | #' | ||||||
|  | #' @keywords internal | ||||||
|  | #' @export | ||||||
|  | cve_linesearch <- function(X, Y, k, | ||||||
|  |                            nObs = sqrt(nrow(X)), | ||||||
|  |                            h = NULL, | ||||||
|  |                            tau = 1.0, | ||||||
|  |                            tol = 1e-3, | ||||||
|  |                            rho1 = 0.1, | ||||||
|  |                            rho2 = 0.9, | ||||||
|  |                            slack = 0, | ||||||
|  |                            epochs = 50L, | ||||||
|  |                            attempts = 10L, | ||||||
|  |                            max.linesearch.iter = 10L | ||||||
|  | ) { | ||||||
|  |     # Set `grad` functions environment to enable if to find this environments | ||||||
|  |     # local variabels, needed to enable the manipulation of this local variables | ||||||
|  |     # from within `grad`. | ||||||
|  |     environment(grad) <- environment() | ||||||
|  | 
 | ||||||
|  |     # Setup histories. | ||||||
|  |     loss.history <- matrix(NA, epochs, attempts) | ||||||
|  |     error.history <- matrix(NA, epochs, attempts) | ||||||
|  |     tau.history <- matrix(NA, epochs, attempts) | ||||||
|  | 
 | ||||||
|  |     # Get dimensions. | ||||||
|  |     n <- nrow(X) | ||||||
|  |     p <- ncol(X) | ||||||
|  |     q <- p - k | ||||||
|  | 
 | ||||||
|  |     # Save initial learning rate `tau`. | ||||||
|  |     tau.init <- tau | ||||||
|  |     # Addapt tolearance for break condition. | ||||||
|  |     tol <- sqrt(2 * q) * tol | ||||||
|  | 
 | ||||||
|  |     # Estaimate bandwidth if not given. | ||||||
|  |     if (missing(h) | !is.numeric(h)) { | ||||||
|  |         h <- estimate.bandwidth(X, k, nObs) | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     # Compute persistent data. | ||||||
|  |     # Compute lookup indexes for symmetrie, lower/upper | ||||||
|  |     # triangular parts and vectorization. | ||||||
|  |     pair.index <- elem.pairs(seq(n)) | ||||||
|  |     i <- pair.index[, 1] # `i` indices of `(i, j)` pairs | ||||||
|  |     j <- pair.index[, 2] # `j` indices of `(i, j)` pairs | ||||||
|  |     # Matrix of vectorized indices. (vec(index) -> seq) | ||||||
|  |     index <- matrix(seq(n * n), n, n) | ||||||
|  |     lower <- index[lower.tri(index)] | ||||||
|  |     upper <- t(index)[lower] | ||||||
|  | 
 | ||||||
|  |     # Create all pairewise differences of rows of `X`. | ||||||
|  |     X_diff <- X[i, , drop = F] - X[j, , drop = F] | ||||||
|  |     # Identity matrix. | ||||||
|  |     I_p <- diag(1, p) | ||||||
|  | 
 | ||||||
|  |     # Init tracking of current best (according multiple attempts). | ||||||
|  |     V.best <- NULL | ||||||
|  |     loss.best <- Inf | ||||||
|  | 
 | ||||||
|  |     # Start loop for multiple attempts. | ||||||
|  |     for (attempt in 1:attempts) { | ||||||
|  | 
 | ||||||
|  |         # Sample a `(p, q)` dimensional matrix from the stiefel manifold as | ||||||
|  |         # optimization start value. | ||||||
|  |         V <- rStiefl(p, q) | ||||||
|  | 
 | ||||||
|  |         # Initial loss and gradient. | ||||||
|  |         loss <- Inf | ||||||
|  |         G <- grad(X, Y, V, h, loss.out = TRUE, persistent = TRUE) | ||||||
|  |         # Set last loss (aka, loss after applying the step). | ||||||
|  |         loss.last <- loss | ||||||
|  | 
 | ||||||
|  |         ## Start optimization loop. | ||||||
|  |         for (epoch in 1:epochs) { | ||||||
|  | 
 | ||||||
|  |             # Cayley transform matrix `A` | ||||||
|  |             A <- (G %*% t(V)) - (V %*% t(G)) | ||||||
|  | 
 | ||||||
|  |             # Directional derivative of the loss at current position, given | ||||||
|  |             # as `Tr(G^T \cdot A \cdot V)`. | ||||||
|  |             loss.prime <- -0.5 * norm(A, type = 'F')^2 | ||||||
|  | 
 | ||||||
|  |             # Linesearch | ||||||
|  |             tau.upper <- Inf | ||||||
|  |             tau.lower <- 0 | ||||||
|  |             tau <- tau.init | ||||||
|  |             for (iter in 1:max.linesearch.iter) { | ||||||
|  |                 # Apply learning rate `tau`. | ||||||
|  |                 A.tau <- (tau / 2) * A | ||||||
|  |                 # Parallet transport (on Stiefl manifold) into direction of `G`. | ||||||
|  |                 inv <- solve(I_p + A.tau) | ||||||
|  |                 V.tau <- inv %*% ((I_p - A.tau) %*% V) | ||||||
|  | 
 | ||||||
|  |                 # Loss at position after a step. | ||||||
|  |                 loss <- Inf # aka loss.tau | ||||||
|  |                 G.tau <- grad(X, Y, V.tau, h, loss.out = TRUE, persistent = TRUE) | ||||||
|  | 
 | ||||||
|  |                 # Armijo condition. | ||||||
|  |                 if (loss > loss.last + (rho1 * tau * loss.prime)) { | ||||||
|  |                     tau.upper <- tau | ||||||
|  |                     tau <- (tau.lower + tau.upper) / 2 | ||||||
|  |                     next() | ||||||
|  |                 } | ||||||
|  | 
 | ||||||
|  |                 V.prime.tau <- -0.5 * inv %*% A %*% (V + V.tau) | ||||||
|  |                 loss.prime.tau <- sum(G * V.prime.tau) # Tr(grad(tau)^T \cdot Y^'(tau)) | ||||||
|  | 
 | ||||||
|  |                 # Wolfe condition. | ||||||
|  |                 if (loss.prime.tau < rho2 * loss.prime) { | ||||||
|  |                     tau.lower <- tau | ||||||
|  |                     if (tau.upper == Inf) { | ||||||
|  |                         tau <- 2 * tau.lower | ||||||
|  |                     } else { | ||||||
|  |                         tau <- (tau.lower + tau.upper) / 2 | ||||||
|  |                     } | ||||||
|  |                 } else { | ||||||
|  |                     break() | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  | 
 | ||||||
|  |             # Compute error. | ||||||
|  |             error <- norm(V %*% t(V) - V.tau %*% t(V.tau), type = "F") | ||||||
|  | 
 | ||||||
|  |             # Write history. | ||||||
|  |             loss.history[epoch, attempt] <- loss | ||||||
|  |             error.history[epoch, attempt] <- error | ||||||
|  |             tau.history[epoch, attempt] <- tau | ||||||
|  | 
 | ||||||
|  |             # Check break condition (epoch check to skip ignored gradient calc). | ||||||
|  |             # Note: the devision by `sqrt(2 * k)` is included in `tol`. | ||||||
|  |             if (error < tol | epoch >= epochs) { | ||||||
|  |                 # take last step and stop optimization. | ||||||
|  |                 V <- V.tau | ||||||
|  |                 break() | ||||||
|  |             } | ||||||
|  | 
 | ||||||
|  |             # Perform the step and remember previous loss. | ||||||
|  |             V <- V.tau | ||||||
|  |             loss.last <- loss | ||||||
|  |             G <- G.tau | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         # Check if current attempt improved previous ones | ||||||
|  |         if (loss < loss.best) { | ||||||
|  |             loss.best <- loss | ||||||
|  |             V.best <- V | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     return(list( | ||||||
|  |         loss.history = loss.history, | ||||||
|  |         error.history = error.history, | ||||||
|  |         tau.history = tau.history, | ||||||
|  |         loss = loss.best, | ||||||
|  |         V = V.best, | ||||||
|  |         B = null(V.best), | ||||||
|  |         h = h | ||||||
|  |     )) | ||||||
|  | } | ||||||
| @ -7,10 +7,20 @@ cve_sgd <- function(X, Y, k, | |||||||
|                     nObs = sqrt(nrow(X)), |                     nObs = sqrt(nrow(X)), | ||||||
|                     h = NULL, |                     h = NULL, | ||||||
|                     tau = 0.01, |                     tau = 0.01, | ||||||
|  |                     tol = 1e-3, | ||||||
|                     epochs = 50L, |                     epochs = 50L, | ||||||
|                     batch.size = 16L, |                     batch.size = 16L, | ||||||
|                     attempts = 10L |                     attempts = 10L | ||||||
| ) { | ) { | ||||||
|  |     # Set `grad` functions environment to enable if to find this environments | ||||||
|  |     # local variabels, needed to enable the manipulation of this local variables | ||||||
|  |     # from within `grad`. | ||||||
|  |     environment(grad) <- environment() | ||||||
|  | 
 | ||||||
|  |     # Setup histories. | ||||||
|  |     loss.history <- matrix(NA, epochs, attempts) | ||||||
|  |     error.history <- matrix(NA, epochs, attempts) | ||||||
|  | 
 | ||||||
|     # Get dimensions. |     # Get dimensions. | ||||||
|     n <- nrow(X) # Number of samples. |     n <- nrow(X) # Number of samples. | ||||||
|     p <- ncol(X) # Data dimensions |     p <- ncol(X) # Data dimensions | ||||||
| @ -18,15 +28,31 @@ cve_sgd <- function(X, Y, k, | |||||||
| 
 | 
 | ||||||
|     # Save initial learning rate `tau`. |     # Save initial learning rate `tau`. | ||||||
|     tau.init <- tau |     tau.init <- tau | ||||||
|  |     # Addapt tolearance for break condition. | ||||||
|  |     tol <- sqrt(2 * q) * tol | ||||||
| 
 | 
 | ||||||
|     # Estaimate bandwidth if not given. |     # Estaimate bandwidth if not given. | ||||||
|     if (missing(h) | !is.numeric(h)) { |     if (missing(h) | !is.numeric(h)) { | ||||||
|         h <- estimate.bandwidth(X, k, nObs) |         h <- estimate.bandwidth(X, k, nObs) | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     # Compute persistent data. | ||||||
|  |     # Compute lookup indexes for symmetrie, lower/upper | ||||||
|  |     # triangular parts and vectorization. | ||||||
|  |     pair.index <- elem.pairs(seq(n)) | ||||||
|  |     i <- pair.index[, 1] # `i` indices of `(i, j)` pairs | ||||||
|  |     j <- pair.index[, 2] # `j` indices of `(i, j)` pairs | ||||||
|  |     # Matrix of vectorized indices. (vec(index) -> seq) | ||||||
|  |     index <- matrix(seq(n * n), n, n) | ||||||
|  |     lower <- index[lower.tri(index)] | ||||||
|  |     upper <- t(index)[lower] | ||||||
|  | 
 | ||||||
|  |     # Create all pairewise differences of rows of `X`. | ||||||
|  |     X_diff <- X[i, , drop = F] - X[j, , drop = F] | ||||||
|  |     # Identity matrix. | ||||||
|  |     I_p <- diag(1, p) | ||||||
|     # Init a list of data indices (shuffled for batching). |     # Init a list of data indices (shuffled for batching). | ||||||
|     indices <- seq(n) |     indices <- seq(n) | ||||||
|     I_p <- diag(1, p) |  | ||||||
| 
 | 
 | ||||||
|     # Init tracking of current best (according multiple attempts). |     # Init tracking of current best (according multiple attempts). | ||||||
|     V.best <- NULL |     V.best <- NULL | ||||||
| @ -40,6 +66,8 @@ cve_sgd <- function(X, Y, k, | |||||||
|         # Sample a `(p, q)` dimensional matrix from the stiefel manifold as |         # Sample a `(p, q)` dimensional matrix from the stiefel manifold as | ||||||
|         # optimization start value. |         # optimization start value. | ||||||
|         V <- rStiefl(p, q) |         V <- rStiefl(p, q) | ||||||
|  |         # Keep track of last `V` for computing error after an epoch. | ||||||
|  |         V.last <- V | ||||||
| 
 | 
 | ||||||
|         # Repeat `epochs` times |         # Repeat `epochs` times | ||||||
|         for (epoch in 1:epochs) { |         for (epoch in 1:epochs) { | ||||||
| @ -55,7 +83,7 @@ cve_sgd <- function(X, Y, k, | |||||||
| 
 | 
 | ||||||
|                 # Compute batch gradient. |                 # Compute batch gradient. | ||||||
|                 loss <- NULL |                 loss <- NULL | ||||||
|                 G <- grad(X[batch, ], Y[batch], V, h) |                 G <- grad(X[batch, ], Y[batch], V, h, loss.out = TRUE) | ||||||
| 
 | 
 | ||||||
|                 # Cayley transform matrix. |                 # Cayley transform matrix. | ||||||
|                 A <- (G %*% t(V)) - (V %*% t(G)) |                 A <- (G %*% t(V)) - (V %*% t(G)) | ||||||
| @ -65,27 +93,34 @@ cve_sgd <- function(X, Y, k, | |||||||
|                 # Parallet transport (on Stiefl manifold) into direction of `G`. |                 # Parallet transport (on Stiefl manifold) into direction of `G`. | ||||||
|                 V <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V) |                 V <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V) | ||||||
|             } |             } | ||||||
|         } |             # Compute actuall loss after finishing optimization. | ||||||
|         # Compute actuall loss after finishing optimization. |             loss <- grad(X, Y, V, h, loss.only = TRUE, persistent = TRUE) | ||||||
|         loss <- grad(X, Y, V, h, loss.only = TRUE) |             # And the error for the history. | ||||||
|         # After each attempt, check if last attempt reached a better result. |             error <- norm(V.last %*% t(V.last) - V %*% t(V), type = "F") | ||||||
|         if (!is.null(V.best)) { # Only required if there is already a result. |             V.last <- V | ||||||
|             if (loss < loss.best) { | 
 | ||||||
|                 loss.best <- loss |             # Finaly write history. | ||||||
|                 V.best <- V |             loss.history[epoch, attempt] <- loss | ||||||
|  |             error.history[epoch, attempt] <- error | ||||||
|  | 
 | ||||||
|  |             # Check break condition. | ||||||
|  |             if (error < tol) { | ||||||
|  |                 break() | ||||||
|             } |             } | ||||||
|         } else { |         } | ||||||
|  |         # After each attempt, check if last attempt reached a better result. | ||||||
|  |         if (loss < loss.best) { | ||||||
|             loss.best <- loss |             loss.best <- loss | ||||||
|             V.best <- V |             V.best <- V | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     return(list( |     return(list( | ||||||
|         X = X, Y = Y, k = k, |         loss.history = loss.history, | ||||||
|         nObs = nObs, h = h, tau = tau, |         error.history = error.history, | ||||||
|         epochs = epochs, batch = batch, attempts = attempts, |  | ||||||
|         loss = loss.best, |         loss = loss.best, | ||||||
|         V = V.best, |         V = V.best, | ||||||
|         B = null(V.best) |         B = null(V.best), | ||||||
|  |         h = h | ||||||
|     )) |     )) | ||||||
| } | } | ||||||
|  | |||||||
| @ -17,8 +17,9 @@ cve_simple <- function(X, Y, k, | |||||||
|     # from within `grad`. |     # from within `grad`. | ||||||
|     environment(grad) <- environment() |     environment(grad) <- environment() | ||||||
| 
 | 
 | ||||||
|     # Setup loss histroy. |     # Setup histories. | ||||||
|     loss.history <- matrix(NA, epochs, attempts); |     loss.history <- matrix(NA, epochs, attempts) | ||||||
|  |     error.history <- matrix(NA, epochs, attempts) | ||||||
| 
 | 
 | ||||||
|     # Get dimensions. |     # Get dimensions. | ||||||
|     n <- nrow(X) |     n <- nrow(X) | ||||||
| @ -35,6 +36,20 @@ cve_simple <- function(X, Y, k, | |||||||
|         h <- estimate.bandwidth(X, k, nObs) |         h <- estimate.bandwidth(X, k, nObs) | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     # Compute persistent data. | ||||||
|  |     # Compute lookup indexes for symmetrie, lower/upper | ||||||
|  |     # triangular parts and vectorization. | ||||||
|  |     pair.index <- elem.pairs(seq(n)) | ||||||
|  |     i <- pair.index[, 1] # `i` indices of `(i, j)` pairs | ||||||
|  |     j <- pair.index[, 2] # `j` indices of `(i, j)` pairs | ||||||
|  |     # Matrix of vectorized indices. (vec(index) -> seq) | ||||||
|  |     index <- matrix(seq(n * n), n, n) | ||||||
|  |     lower <- index[lower.tri(index)] | ||||||
|  |     upper <- t(index)[lower] | ||||||
|  | 
 | ||||||
|  |     # Create all pairewise differences of rows of `X`. | ||||||
|  |     X_diff <- X[i, , drop = F] - X[j, , drop = F] | ||||||
|  |     # Identity matrix. | ||||||
|     I_p <- diag(1, p) |     I_p <- diag(1, p) | ||||||
| 
 | 
 | ||||||
|     # Init tracking of current best (according multiple attempts). |     # Init tracking of current best (according multiple attempts). | ||||||
| @ -53,7 +68,7 @@ cve_simple <- function(X, Y, k, | |||||||
| 
 | 
 | ||||||
|         # Initial loss and gradient. |         # Initial loss and gradient. | ||||||
|         loss <- Inf |         loss <- Inf | ||||||
|         G <- grad(X, Y, V, h, loss.out = TRUE) # `loss.out=T` sets `loss`! |         G <- grad(X, Y, V, h, loss.out = TRUE, persistent = TRUE) | ||||||
|         # Set last loss (aka, loss after applying the step). |         # Set last loss (aka, loss after applying the step). | ||||||
|         loss.last <- loss |         loss.last <- loss | ||||||
| 
 | 
 | ||||||
| @ -68,7 +83,7 @@ cve_simple <- function(X, Y, k, | |||||||
|             V.tau <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V) |             V.tau <- solve(I_p + A.tau) %*% ((I_p - A.tau) %*% V) | ||||||
| 
 | 
 | ||||||
|             # Loss at position after a step. |             # Loss at position after a step. | ||||||
|             loss <- grad(X, Y, V.tau, h, loss.only = TRUE) |             loss <- grad(X, Y, V.tau, h, loss.only = TRUE, persistent = TRUE) | ||||||
| 
 | 
 | ||||||
|             # Check if step is appropriate |             # Check if step is appropriate | ||||||
|             if ((loss - loss.last) > slack * loss.last) { |             if ((loss - loss.last) > slack * loss.last) { | ||||||
| @ -78,6 +93,11 @@ cve_simple <- function(X, Y, k, | |||||||
| 
 | 
 | ||||||
|             # Compute error. |             # Compute error. | ||||||
|             error <- norm(V %*% t(V) - V.tau %*% t(V.tau), type = "F") |             error <- norm(V %*% t(V) - V.tau %*% t(V.tau), type = "F") | ||||||
|  | 
 | ||||||
|  |             # Write history. | ||||||
|  |             loss.history[epoch, attempt] <- loss | ||||||
|  |             error.history[epoch, attempt] <- error | ||||||
|  | 
 | ||||||
|             # Check break condition (epoch check to skip ignored gradient calc). |             # Check break condition (epoch check to skip ignored gradient calc). | ||||||
|             # Note: the devision by `sqrt(2 * k)` is included in `tol`. |             # Note: the devision by `sqrt(2 * k)` is included in `tol`. | ||||||
|             if (error < tol | epoch >= epochs) { |             if (error < tol | epoch >= epochs) { | ||||||
| @ -91,8 +111,7 @@ cve_simple <- function(X, Y, k, | |||||||
|             loss.last <- loss |             loss.last <- loss | ||||||
| 
 | 
 | ||||||
|             # Compute gradient at new position. |             # Compute gradient at new position. | ||||||
|             # Note: `loss` will be updated too! |             G <- grad(X, Y, V, h, persistent = TRUE) | ||||||
|             G <- grad(X, Y, V, h, loss.out = TRUE, loss.log = TRUE) |  | ||||||
| 
 | 
 | ||||||
|             # Cayley transform matrix `A` |             # Cayley transform matrix `A` | ||||||
|             A <- (G %*% t(V)) - (V %*% t(G)) |             A <- (G %*% t(V)) - (V %*% t(G)) | ||||||
| @ -108,6 +127,7 @@ cve_simple <- function(X, Y, k, | |||||||
| 
 | 
 | ||||||
|     return(list( |     return(list( | ||||||
|         loss.history = loss.history, |         loss.history = loss.history, | ||||||
|  |         error.history = error.history, | ||||||
|         loss = loss.best, |         loss = loss.best, | ||||||
|         V = V.best, |         V = V.best, | ||||||
|         B = null(V.best), |         B = null(V.best), | ||||||
|  | |||||||
| @ -4,38 +4,47 @@ | |||||||
| #' @param Y Responce. | #' @param Y Responce. | ||||||
| #' @param V Position to compute the gradient at, aka point on Stiefl manifold. | #' @param V Position to compute the gradient at, aka point on Stiefl manifold. | ||||||
| #' @param h Bandwidth | #' @param h Bandwidth | ||||||
|  | #' @param loss.out Iff \code{TRUE} loss will be written to parent environment. | ||||||
| #' @param loss.only Boolean to only compute the loss, of \code{TRUE} a single | #' @param loss.only Boolean to only compute the loss, of \code{TRUE} a single | ||||||
| #'  value loss is returned and \code{envir} is ignored. | #'  value loss is returned and \code{envir} is ignored. | ||||||
|  | #' @param persistent Determines if data indices and dependent calculations shall | ||||||
|  | #'  be reused from the parent environment. ATTENTION: Do NOT set this flag, only | ||||||
|  | #'  intended for internal usage by carefully aligned functions! | ||||||
| #' @keywords internal | #' @keywords internal | ||||||
| #' @export | #' @export | ||||||
| grad <- function(X, Y, V, h, loss.out = FALSE, loss.log = FALSE, loss.only = FALSE) { | grad <- function(X, Y, V, h, | ||||||
|  |                  loss.out = FALSE, | ||||||
|  |                  loss.only = FALSE, | ||||||
|  |                  persistent = FALSE) { | ||||||
|     # Get number of samples and dimension. |     # Get number of samples and dimension. | ||||||
|     n <- nrow(X) |     n <- nrow(X) | ||||||
|     p <- ncol(X) |     p <- ncol(X) | ||||||
| 
 | 
 | ||||||
|     # Compute lookup indexes for symmetrie, lower/upper |     if (!persistent) { | ||||||
|     # triangular parts and vectorization. |         # Compute lookup indexes for symmetrie, lower/upper | ||||||
|     pair.index <- elem.pairs(seq(n)) |         # triangular parts and vectorization. | ||||||
|     i <- pair.index[, 1] # `i` indices of `(i, j)` pairs |         pair.index <- elem.pairs(seq(n)) | ||||||
|     j <- pair.index[, 2] # `j` indices of `(i, j)` pairs |         i <- pair.index[, 1] # `i` indices of `(i, j)` pairs | ||||||
|     # Matrix of vectorized indices. (vec(index) -> seq) |         j <- pair.index[, 2] # `j` indices of `(i, j)` pairs | ||||||
|     index <- matrix(seq(n * n), n, n) |         # Matrix of vectorized indices. (vec(index) -> seq) | ||||||
|     lower <- index[lower.tri(index)] |         index <- matrix(seq(n * n), n, n) | ||||||
|     upper <- t(index)[lower] |         lower <- index[lower.tri(index)] | ||||||
|  |         upper <- t.default(index)[lower] | ||||||
| 
 | 
 | ||||||
|     # Create all pairewise differences of rows of `X`. |         # Create all pairewise differences of rows of `X`. | ||||||
|     X_diff <- X[i, , drop = F] - X[j, , drop = F] |         X_diff <- X[i, , drop = F] - X[j, , drop = F] | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|     # Projection matrix onto `span(V)` |     # Projection matrix onto `span(V)` | ||||||
|     Q <- diag(1, p) - (V %*% t(V)) |     Q <- diag(1, p) - tcrossprod(V, V) | ||||||
| 
 | 
 | ||||||
|     # Vectorized distance matrix `D`. |     # Vectorized distance matrix `D`. | ||||||
|     vecD <- rowSums((X_diff %*% Q)^2) |     vecD <- rowSums((X_diff %*% Q)^2) | ||||||
| 
 | 
 | ||||||
|     # Weight matrix `W` (dnorm ... gaussean density function) |     # Weight matrix `W` (dnorm ... gaussean density function) | ||||||
|     W <- matrix(dnorm(0), n, n) |     W <- matrix(1, n, n) # `exp(0) == 1` | ||||||
|     W[lower] <- dnorm(vecD / h) # Set lower tri. part |     W[lower] <- exp((-0.5 / h) * vecD^2) # Set lower tri. part | ||||||
|     W[upper] <- t(W)[upper] # Mirror lower tri. to upper |     W[upper] <- t.default(W)[upper] # Mirror lower tri. to upper | ||||||
|     W <- sweep(W, 2, colSums(W), FUN = `/`) # Col-Normalize |     W <- sweep(W, 2, colSums(W), FUN = `/`) # Col-Normalize | ||||||
| 
 | 
 | ||||||
|     # Weighted `Y` momentums |     # Weighted `Y` momentums | ||||||
| @ -44,19 +53,11 @@ grad <- function(X, Y, V, h, loss.out = FALSE, loss.log = FALSE, loss.only = FAL | |||||||
| 
 | 
 | ||||||
|     # Per example loss `L(V, X_i)` |     # Per example loss `L(V, X_i)` | ||||||
|     L <- y2 - y1^2 |     L <- y2 - y1^2 | ||||||
|     if (loss.out | loss.log | loss.only) { |     if (loss.only) { | ||||||
|         meanL <- mean(L) |         return(mean(L)) | ||||||
|         if (loss.out) { |     } | ||||||
|             # Bubble environments up and write to loss variable, aka out param. |     if (loss.out) { | ||||||
|             loss <<- meanL |         loss <<- mean(L) | ||||||
|         } |  | ||||||
|         if (loss.log) { |  | ||||||
|             loss.history[epoch, attempt] <<- meanL |  | ||||||
|         } |  | ||||||
|         if (loss.only) { |  | ||||||
|             # Mean for total loss `L(V)`. |  | ||||||
|             return(meanL) |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     # Vectorized Weights with forced symmetry |     # Vectorized Weights with forced symmetry | ||||||
| @ -66,7 +67,7 @@ grad <- function(X, Y, V, h, loss.out = FALSE, loss.log = FALSE, loss.only = FAL | |||||||
|     vecS <- vecS * vecD |     vecS <- vecS * vecD | ||||||
| 
 | 
 | ||||||
|     # The gradient. |     # The gradient. | ||||||
|     G <- t(X_diff) %*% sweep(X_diff %*% V, 1, vecS, `*`) |     G <- crossprod(X_diff, sweep(X_diff %*% V, 1, vecS, `*`)) | ||||||
|     G <- (-2 / (n * h^2)) * G |     G <- (-2 / (n * h^2)) * G | ||||||
|     return(G) |     return(G) | ||||||
| } | } | ||||||
|  | |||||||
							
								
								
									
										16
									
								
								CVE_R/man/cve_linesearch.Rd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								CVE_R/man/cve_linesearch.Rd
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,16 @@ | |||||||
|  | % Generated by roxygen2: do not edit by hand | ||||||
|  | % Please edit documentation in R/cve_linesearch.R | ||||||
|  | \name{cve_linesearch} | ||||||
|  | \alias{cve_linesearch} | ||||||
|  | \title{Implementation of the CVE method using curvilinear linesearch with Armijo-Wolfe | ||||||
|  | conditions.} | ||||||
|  | \usage{ | ||||||
|  | cve_linesearch(X, Y, k, nObs = sqrt(nrow(X)), h = NULL, tau = 1, | ||||||
|  |   tol = 0.001, rho1 = 0.1, rho2 = 0.9, slack = 0, epochs = 50L, | ||||||
|  |   attempts = 10L, max.linesearch.iter = 10L) | ||||||
|  | } | ||||||
|  | \description{ | ||||||
|  | Implementation of the CVE method using curvilinear linesearch with Armijo-Wolfe | ||||||
|  | conditions. | ||||||
|  | } | ||||||
|  | \keyword{internal} | ||||||
| @ -6,7 +6,7 @@ | |||||||
| a classic GD method unsing no further tricks.} | a classic GD method unsing no further tricks.} | ||||||
| \usage{ | \usage{ | ||||||
| cve_sgd(X, Y, k, nObs = sqrt(nrow(X)), h = NULL, tau = 0.01, | cve_sgd(X, Y, k, nObs = sqrt(nrow(X)), h = NULL, tau = 0.01, | ||||||
|   epochs = 50L, batch.size = 16L, attempts = 10L) |   tol = 0.001, epochs = 50L, batch.size = 16L, attempts = 10L) | ||||||
| } | } | ||||||
| \description{ | \description{ | ||||||
| Simple implementation of the CVE method. 'Simple' means that this method is | Simple implementation of the CVE method. 'Simple' means that this method is | ||||||
|  | |||||||
| @ -4,8 +4,8 @@ | |||||||
| \alias{grad} | \alias{grad} | ||||||
| \title{Compute get gradient of `L(V)` given a dataset `X`.} | \title{Compute get gradient of `L(V)` given a dataset `X`.} | ||||||
| \usage{ | \usage{ | ||||||
| grad(X, Y, V, h, loss.out = FALSE, loss.log = FALSE, | grad(X, Y, V, h, loss.out = FALSE, loss.only = FALSE, | ||||||
|   loss.only = FALSE) |   persistent = FALSE) | ||||||
| } | } | ||||||
| \arguments{ | \arguments{ | ||||||
| \item{X}{Data matrix.} | \item{X}{Data matrix.} | ||||||
| @ -16,8 +16,14 @@ grad(X, Y, V, h, loss.out = FALSE, loss.log = FALSE, | |||||||
| 
 | 
 | ||||||
| \item{h}{Bandwidth} | \item{h}{Bandwidth} | ||||||
| 
 | 
 | ||||||
|  | \item{loss.out}{Iff \code{TRUE} loss will be written to parent environment.} | ||||||
|  | 
 | ||||||
| \item{loss.only}{Boolean to only compute the loss, of \code{TRUE} a single | \item{loss.only}{Boolean to only compute the loss, of \code{TRUE} a single | ||||||
| value loss is returned and \code{envir} is ignored.} | value loss is returned and \code{envir} is ignored.} | ||||||
|  | 
 | ||||||
|  | \item{persistent}{Determines if data indices and dependent calculations shall | ||||||
|  | be reused from the parent environment. ATTENTION: Do NOT set this flag, only | ||||||
|  | intended for internal usage by carefully aligned functions!} | ||||||
| } | } | ||||||
| \description{ | \description{ | ||||||
| Compute get gradient of `L(V)` given a dataset `X`. | Compute get gradient of `L(V)` given a dataset `X`. | ||||||
|  | |||||||
							
								
								
									
										86
									
								
								notes.md
									
									
									
									
									
								
							
							
						
						
									
										86
									
								
								notes.md
									
									
									
									
									
								
							| @ -111,3 +111,89 @@ do.call(f,   list(quote(A)), envir = env)     # f.Global A.new | |||||||
| do.call("f", list(as.name("A")), envir = env) # f.new A.new | do.call("f", list(as.name("A")), envir = env) # f.new A.new | ||||||
| do.call("f", list(as.name("A")), envir = env) # f.new A.new | do.call("f", list(as.name("A")), envir = env) # f.new A.new | ||||||
| ``` | ``` | ||||||
|  | 
 | ||||||
|  | # Performance benchmarks | ||||||
|  | In this section alternative implementations of simple algorithms are compared for there performance. | ||||||
|  | 
 | ||||||
|  | ### Computing the trace of a matrix multiplication. | ||||||
|  | ```R | ||||||
|  | library(microbenchmark) | ||||||
|  | 
 | ||||||
|  | A <- matrix(runif(120), 12, 10) | ||||||
|  | 
 | ||||||
|  | # Matrix trace. | ||||||
|  | tr <- function(M) sum(diag(M)) | ||||||
|  | 
 | ||||||
|  | # Check correctnes and benckmark performance. | ||||||
|  | stopifnot( | ||||||
|  |     all.equal( | ||||||
|  |         tr(t(A) %*% A), | ||||||
|  |         sum(diag(t(A) %*% A)), | ||||||
|  |         sum(A * A) | ||||||
|  |     ) | ||||||
|  | ) | ||||||
|  | microbenchmark( | ||||||
|  |     tr(t(A) %*% A), | ||||||
|  |     sum(diag(t(A) %*% A)), | ||||||
|  |     sum(A * A) | ||||||
|  | ) | ||||||
|  | # Unit: nanoseconds | ||||||
|  | #                   expr  min   lq    mean median     uq   max neval | ||||||
|  | #         tr(t(A) %*% A) 4335 4713 5076.36 4949.5 5402.5  7928   100 | ||||||
|  | #  sum(diag(t(A) %*% A)) 4106 4429 5233.89 4733.5 5057.5 49308   100 | ||||||
|  | #             sum(A * A)  540  681  777.07  740.0  818.5  3572   100 | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | ```R | ||||||
|  | n <- 200 | ||||||
|  | M <- matrix(runif(n^2), n, n) | ||||||
|  | 
 | ||||||
|  | dnorm2 <- function(x) exp(-0.5 * x^2) / sqrt(2 * pi) | ||||||
|  | 
 | ||||||
|  | stopifnot( | ||||||
|  |     all.equal(dnorm(M), dnorm2(M)) | ||||||
|  | ) | ||||||
|  | microbenchmark( | ||||||
|  |     dnorm = dnorm(M), | ||||||
|  |     dnorm2 = dnorm2(M), | ||||||
|  |     exp = exp(-0.5 * M^2) # without scaling -> irrelevant for usage | ||||||
|  | ) | ||||||
|  | # Unit: microseconds | ||||||
|  | #   expr     min      lq     mean   median       uq      max neval | ||||||
|  | #  dnorm 841.503 843.811 920.7828 855.7505 912.4720 2405.587   100 | ||||||
|  | # dnorm2 543.510 580.319 629.5321 597.8540 607.3795 2603.763   100 | ||||||
|  | #    exp 502.083 535.943 577.2884 548.3745 561.3280 2113.220   100 | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | ### Using `crosspord()` | ||||||
|  | ```R | ||||||
|  | p <- 12 | ||||||
|  | q <- 10 | ||||||
|  | V <- matrix(runif(p * q), p, q) | ||||||
|  | 
 | ||||||
|  | stopifnot( | ||||||
|  |     all.equal(V %*% t(V), tcrossprod(V)), | ||||||
|  |     all.equal(V %*% t(V), tcrossprod(V, V)) | ||||||
|  | ) | ||||||
|  | microbenchmark( | ||||||
|  |     V %*% t(V), | ||||||
|  |     tcrossprod(V), | ||||||
|  |     tcrossprod(V, V) | ||||||
|  | ) | ||||||
|  | # Unit: microseconds | ||||||
|  | #              expr   min     lq    mean median     uq    max neval | ||||||
|  | #        V %*% t(V) 2.293 2.6335 2.94673 2.7375 2.9060 19.592   100 | ||||||
|  | #     tcrossprod(V) 1.148 1.2475 1.86173 1.3440 1.4650 30.688   100 | ||||||
|  | #  tcrossprod(V, V) 1.003 1.1575 1.28451 1.2400 1.3685  2.742   100 | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | ## Using `Rprof()` for performance. | ||||||
|  | The standart method for profiling where an algorithm is spending its time is with `Rprof()`. | ||||||
|  | ```R | ||||||
|  | path <- '../tmp/R.prof' # path to profiling file | ||||||
|  | Rprof(path) | ||||||
|  | cve.res <- cve.call(X, Y, k = k) | ||||||
|  | Rprof(NULL) | ||||||
|  | (prof <- summaryRprof(path)) # Summarise results | ||||||
|  | ``` | ||||||
|  | **Note: considure to run `gc()` before measuring**, aka cleaning up by explicitely calling the garbage collector. | ||||||
|  | |||||||
| @ -7,9 +7,11 @@ tell.user <- function(name, start.time, i, length) { | |||||||
|         i, "/", length, |         i, "/", length, | ||||||
|         " - elapsed:", format(Sys.time() - start.time), "\033[K") |         " - elapsed:", format(Sys.time() - start.time), "\033[K") | ||||||
| } | } | ||||||
| 
 | subspace.dist <- function(B1, B2){ | ||||||
| library(CVE) # load CVE |     P1 <- B1 %*% solve(t(B1) %*% B1) %*% t(B1) | ||||||
| source("CVE_legacy/function_script.R") # Source legacy code |     P2 <- B2 %*% solve(t(B2) %*% B2) %*% t(B2) | ||||||
|  |     return(norm(P1 - P2, type = 'F')) | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| # Number of simulations | # Number of simulations | ||||||
| SIM.NR <- 50 | SIM.NR <- 50 | ||||||
| @ -20,8 +22,13 @@ ATTEMPTS <- 10 | |||||||
| # set names of datasets | # set names of datasets | ||||||
| dataset.names <- c("M1", "M2", "M3", "M4", "M5") | dataset.names <- c("M1", "M2", "M3", "M4", "M5") | ||||||
| # Set used CVE method | # Set used CVE method | ||||||
| # methods <- c("legacy", "simple", "sgd") | methods <- c("simple") #, "sgd") # "legacy" | ||||||
| methods <- c("legacy", "simple", "sgd") | 
 | ||||||
|  | library(CVE) # load CVE | ||||||
|  | if ("legacy" %in% methods) { | ||||||
|  |     # Source legacy code (but only if needed) | ||||||
|  |     source("CVE_legacy/function_script.R") | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| # Setup error and time tracking variables | # Setup error and time tracking variables | ||||||
| error <- matrix(NA, SIM.NR, length(methods) * length(dataset.names)) | error <- matrix(NA, SIM.NR, length(methods) * length(dataset.names)) | ||||||
| @ -79,7 +86,7 @@ for (sim in 1:SIM.NR) { | |||||||
|             } |             } | ||||||
| 
 | 
 | ||||||
|             key <- paste0(name, '-', method) |             key <- paste0(name, '-', method) | ||||||
|             error[sim, key] <- subspace_dist(dr$B, ds$B) / sqrt(2 * truedim) |             error[sim, key] <- subspace.dist(dr$B, ds$B) / sqrt(2 * truedim) | ||||||
|             time[sim, key] <- dr.time["elapsed"] |             time[sim, key] <- dr.time["elapsed"] | ||||||
| 
 | 
 | ||||||
|             # Log results to file (mostly for long running simulations) |             # Log results to file (mostly for long running simulations) | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user