#' Compute the gradient of `L(V)` given a dataset `X`.
#'
#' @param X Data matrix.
#' @param Y Response.
#' @param V Position to compute the gradient at, i.e. a point on the Stiefel
#'      manifold.
#' @param h Bandwidth.
#' @param loss.out If \code{TRUE}, the loss is written to the parent
#'      environment.
#' @param loss.only Boolean to only compute the loss; if \code{TRUE}, a single
#'      loss value is returned and \code{loss.out} is ignored.
#' @param persistent Determines whether data indices and dependent calculations
#'      are reused from the parent environment. ATTENTION: Do NOT set this
#'      flag; it is only intended for internal usage by carefully aligned
#'      functions!
#' @return Gradient of `L(V)`, a matrix of the same dimensions as \code{V}.
#' @keywords internal
#' @export
grad <- function(X, Y, V, h,
                 loss.out = FALSE,
                 loss.only = FALSE,
                 persistent = FALSE) {
    # Get number of samples and dimension.
    n <- nrow(X)
    p <- ncol(X)

    if (!persistent) {
        # Compute lookup indexes for symmetry, lower/upper
        # triangular parts and vectorization.
        pair.index <- elem.pairs(seq(n))
        i <- pair.index[1, ] # `i` indices of `(i, j)` pairs
        j <- pair.index[2, ] # `j` indices of `(i, j)` pairs
        # Index of vectorized matrix, for lower and upper triangular part.
        lower <- ((i - 1) * n) + j
        upper <- ((j - 1) * n) + i

        # Create all pairwise differences of rows of `X`.
        X_diff <- X[i, , drop = FALSE] - X[j, , drop = FALSE]
    }

    # Projection matrix onto the orthogonal complement of `span(V)`.
    Q <- diag(1, p) - tcrossprod(V, V)

    # Vectorized distance matrix `D`.
    vecD <- colSums(tcrossprod(Q, X_diff)^2)

    # Create the kernel matrix (i.e. apply the kernel to the distances).
    K <- matrix(1, n, n)                 # `exp(0) == 1`
    K[lower] <- exp((-0.5 / h) * vecD^2) # Set lower tri. part
    K[upper] <- t(K)[upper]              # Mirror lower tri. to upper

    # Weighted first and second moments of `Y`.
    colSumsK <- colSums(K)
    y1 <- (K %*% Y) / colSumsK
    y2 <- (K %*% Y^2) / colSumsK

    # Per-sample loss `L(V, X_i)`.
    L <- y2 - y1^2
    if (loss.only) {
        return(mean(L))
    }
    if (loss.out) {
        loss <<- mean(L)
    }

    # Compute scaling vector `vecS` for `X_diff`.
    # Outer difference `(y1[i] - Y[j])^2` for all pairs `(i, j)`.
    tmp <- kronecker(matrix(y1, n, 1), matrix(Y, 1, n), `-`)^2
    tmp <- as.vector(L) - tmp
    tmp <- tmp * K / colSumsK
    vecS <- (tmp + t(tmp))[lower] * vecD

    # The gradient.
    # 1. `crossprod(A, B)` is equivalent to `t(A) %*% B`.
    # 2. `(X_diff %*% V) * vecS` is first a matrix-matrix multiplication and
    #    then uses recycling to scale each row with the values of `vecS`.
    #    Note that `vecS` is a vector and that `R` uses column-major ordering
    #    of matrices.
    # (See: notes for more details)
    # TODO: Depending on n, p, q decide which version to take (for current
    # datasets "inner" is faster, see: notes).
    # inner = crossprod(X_diff, X_diff * vecS) %*% V,
    # outer = crossprod(X_diff, (X_diff %*% V) * vecS)
    G <- crossprod(X_diff, X_diff * vecS) %*% V
    G <- (-2 / (n * h^2)) * G

    return(G)
}
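
# A minimal usage sketch (not run), kept as a comment since `grad` is an
# internal function: it assumes the package helper `elem.pairs` used above is
# available in the namespace, and generates a random point on the Stiefel
# manifold via a QR decomposition. All variable names below are illustrative
# only and not part of the package API.
#
#   n <- 100; p <- 5; q <- 2
#   X <- matrix(rnorm(n * p), n, p)            # data matrix
#   Y <- rnorm(n)                              # response
#   V <- qr.Q(qr(matrix(rnorm(p * q), p, q)))  # p x q matrix, orthonormal columns
#   G <- grad(X, Y, V, h = 0.5)                # gradient at V, same shape as V
#   l <- grad(X, Y, V, h = 0.5, loss.only = TRUE)  # scalar loss only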