diff --git a/tensorPredictors/R/RMReg.R b/tensorPredictors/R/RMReg.R
new file mode 100644
index 0000000..5b952e4
--- /dev/null
+++ b/tensorPredictors/R/RMReg.R
@@ -0,0 +1,127 @@
+#' Regularized Matrix Regression
+#'
+#' Solves the regularized problem
+#'      \deqn{\min_B h(B) = l(B) + J(B)}
+#' for a matrix \eqn{B}, where \eqn{l} is a loss function; for the GLM, we use
+#' the negative log-likelihood as the loss. \eqn{J(B) = f(\sigma(B))}, where
+#' \eqn{f} is a function of the singular values of \eqn{B}.
+#'
+#' The default parameterization is a nuclear norm penalized least squares
+#' regression.
+#'
+#' The least squares loss combined with \eqn{f(s) = \lambda \sum_i |s_i|}
+#' corresponds to the nuclear norm regularization problem.
+#'
+#' @param X the signal data, either as a 3D tensor or a 2D matrix. In case of
+#'  a 3D tensor the axes are assumed to be \eqn{n\times p\times q}, meaning the
+#'  first dimension indexes the observations while the second and third are the
+#'  `image' dimensions. When the data is provided as a matrix it is assumed to
+#'  be of shape \eqn{n\times pq} where each observation is the vectorized
+#'  `image'.
+#' @param y univariate response vector
+#' @param lambda penalty term (note: values between 0 and the max. singular
+#'  value of the least squares solution give non-trivial results)
+#' @param loss loss function, the data fitting part of the objective function
+#' @param grad.loss gradient of the loss function (required, there is no
+#'  support for numerical gradients)
+#' @param penalty penalty function taking the vector of singular values of the
+#'  current iterate as its argument. The default function
+#'  \code{function(sigma) sum(sigma)} is the nuclear norm penalty.
+#' @param shape Shape of the matrix valued predictors. Required iff the
+#'  predictors \code{X} are provided in vectorized form, e.g. as a 2D matrix.
+#' @param step.size maximum step size for gradient updates
+#' @param alpha function computing the next Nesterov momentum scaling from the
+#'  current value \code{a} and the iteration counter \code{t}
+#' @param B0 initial value for optimization, a matrix of dimensions
+#'  \eqn{p\times q}
+#' @param max.iter maximum number of gradient updates
+#' @param max.line.iter maximum number of line search iterations
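+#'
+#' @examples
+#' # Minimal usage sketch (simulated data; an illustration, not part of the
+#' # original sources). Assumes a rank 1 true coefficient matrix and the
+#' # default nuclear norm penalty; `n`, `p`, `q` and `lambda = 1` are
+#' # arbitrary choices.
+#' \dontrun{
+#' n <- 200; p <- 5; q <- 7
+#' X <- array(rnorm(n * p * q), dim = c(n, p, q))
+#' B.true <- tcrossprod(rnorm(p), rnorm(q))    # rank 1 coefficient matrix
+#' y <- c(matrix(X, nrow = n) %*% c(B.true)) + rnorm(n, sd = 0.1)
+#' B.hat <- RMReg(X, y, lambda = 1)
+#' }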
+#'
+#' @export
+RMReg <- function(X, y, lambda = 0,
+    loss = function(B, X, y) 0.5 * sum((y - X %*% c(B))^2),
+    grad.loss = function(B, X, y) crossprod(X %*% c(B) - y, X),
+    penalty = function(sigma) sum(sigma),
+    shape = dim(X)[-1],
+    step.size = 1e-3,
+    alpha = function(a, t) { (1 + sqrt(1 + (2 * a)^2)) / 2 },
+    B0 = array(0, dim = shape),
+    max.iter = 500,
+    max.line.iter = ceiling(log(step.size / sqrt(.Machine$double.eps), 2))
+) {
+    ### Check (prepare) parameters
+    stopifnot(nrow(X) == length(y))
+    if (!missing(shape)) {
+        stopifnot(ncol(X) == prod(shape))
+    } else {
+        stopifnot(length(dim(X)) == 3)
+        dim(X) <- c(nrow(X), prod(shape))
+    }
+
+    ### Set initial values
+    # singular values of B1 (only the current point is needed, not the
+    # previous iterate B0)
+    if (missing(B0)) {
+        b1 <- rep(0, min(shape))
+    } else {
+        b1 <- La.svd(B0, 0, 0)$d
+    }
+    B1 <- B0
+    a0 <- 0
+    a1 <- 1
+    loss1 <- loss(B1, X, y)
+
+    ### Repeat until convergence
+    for (iter in seq_len(max.iter)) {
+        # Extrapolation (Nesterov Momentum)
+        S <- B1 + ((a0 - 1) / a1) * (B1 - B0)
+
+        # Compute gradient of the loss at the extrapolation point
+        grad <- array(grad.loss(S, X, y), dim = shape)
+
+        # Line Search (executed at least once)
+        for (delta in step.size * 0.5^seq(0, max.line.iter - 1)) {
+            # (potential) next step with delta as step size for gradient update
+            A <- S - delta * grad
+
+            if (lambda > 0) {
+                # SVD of (potential) next step
+                svdA <- La.svd(A)
+
+                # Get (potential) next penalized iterate by soft-thresholding
+                # the singular values (nuclear norm version only)
+                b.temp <- pmax(0, svdA$d - lambda)      # singular values of B.temp
+                B.temp <- svdA$u %*% (b.temp * svdA$vt)
+            } else {
+                # in case of no penalization (pure least squares solution)
+                b.temp <- La.svd(A, 0, 0)$d
+                B.temp <- A
+            }
+
+            # Check line search break condition
+            #   h(B.temp) <= g(B.temp | S, delta)
+            #   \_ left _/    \_____ right _____/
+            # where g(B.temp | S, delta) is the first order approximation of
+            # the loss at the new point
+            #   l(S) + <grad l(S), B.temp - S> + |B.temp - S|_F^2 / (2 delta) + J(B.temp)
+            # (the penalty term appears on both sides and cancels)
+            left <- loss(B.temp, X, y)                          # + penalty(b.temp)
+            right <- loss(S, X, y) + sum(grad * (B.temp - S)) +
+                norm(B.temp - S, 'F')^2 / (2 * delta)           # + penalty(b.temp)
+            if (left <= right) {
+                break
+            }
+        }
+
+        # After the gradient update enforce descent of the objective
+        loss.temp <- loss(B.temp, X, y)
+        if (loss.temp + penalty(b.temp) <= loss1 + penalty(b1)) {
+            loss1 <- loss.temp
+            b1 <- b.temp
+            B0 <- B1
+            B1 <- B.temp
+        } else {
+            break
+        }
+
+        # Update momentum scaling
+        a0 <- a1
+        a1 <- alpha(a1, iter)
+    }
+
+    # return estimate
+    B1
+}
diff --git a/tensorPredictors/R/dist_projection.R b/tensorPredictors/R/dist_projection.R
index 7a5811d..3c66e50 100644
--- a/tensorPredictors/R/dist_projection.R
+++ b/tensorPredictors/R/dist_projection.R
@@ -3,25 +3,41 @@
 #' Defined as sine of the maximum principal angle between the column spaces
 #' of the matrices
 #'      max{ sin theta_i, i = 1, ..., min(d1, d2) }
-#'
+#' In the case rank(A) = rank(B) this measure is equivalent to
+#'      || A A' - B B' ||_2
+#' where ||.||_2 is the spectral norm (max. singular value).
+#'
 #' @param A,B matrices of size \eqn{p\times d_1} and \eqn{p\times d_2}.
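+#' @examples
+#' # Minimal usage sketch (random matrices; an illustration, not part of the
+#' # original sources): distance between two random subspaces and the zero
+#' # distance of a column space to itself.
+#' \dontrun{
+#' A <- matrix(rnorm(10 * 3), 10, 3)
+#' B <- matrix(rnorm(10 * 3), 10, 3)
+#' dist.projection(A, B)                               # some value in [0, 1]
+#' dist.projection(A, A %*% matrix(rnorm(9), 3, 3))    # ~0, same column space
+#' }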
 #'
 #' @export
 dist.projection <- function(A, B, is.ortho = FALSE,
     tol = sqrt(.Machine$double.eps)
 ) {
+    if (!is.matrix(A)) A <- as.matrix(A)
+    if (!is.matrix(B)) B <- as.matrix(B)
+
     if (!is.ortho) {
         qrA <- qr(A, tol)
+        rankA <- qrA$rank
         A <- qr.Q(qrA)[, seq_len(qrA$rank), drop = FALSE]
         qrB <- qr(B, tol)
+        rankB <- qrB$rank
         B <- qr.Q(qrB)[, seq_len(qrB$rank), drop = FALSE]
+    } else {
+        rankA <- ncol(A)
+        rankB <- ncol(B)
     }
-    if (ncol(A) == 0L && ncol(B) == 0L) {
-        0
-    } else if (ncol(A) == 0L || ncol(B) == 0L) {
-        1
+    if (rankA == 0 || rankB == 0) {
+        # one or both subspaces are trivial: distance is 0 iff both are trivial
+        return(as.double(rankA != rankB))
+    } else if (rankA == 1 && rankB == 1) {
+        # one dimensional case: cosine of the single principal angle directly
+        sigma.min <- min(abs(sum(A * B)), 1)
     } else {
-        sin(acos(min(c(La.svd(crossprod(A, B), 0, 0)$d, 1))))
+        # smallest singular value of A' B = cosine of the max. principal angle
+        sigma.min <- min(La.svd(crossprod(A, B), 0, 0)$d, 1)
+    }
+    # sine of the maximum principal angle, i.e. sqrt(1 - sigma.min^2)
+    if (sigma.min < 0.5) {
+        sin(acos(sigma.min))
+    } else {
+        cos(asin(sigma.min))
+    }
 }