From 79f5e9781f81c57f908e7d84a49f4c20e28b453a Mon Sep 17 00:00:00 2001
From: daniel
Date: Tue, 18 Jan 2022 12:30:05 +0100
Subject: [PATCH] fix: RMReg

---
 tensorPredictors/R/RMReg.R | 205 ++++++++++++++++++-------------------
 1 file changed, 99 insertions(+), 106 deletions(-)

diff --git a/tensorPredictors/R/RMReg.R b/tensorPredictors/R/RMReg.R
index 9c01454..35208f1 100644
--- a/tensorPredictors/R/RMReg.R
+++ b/tensorPredictors/R/RMReg.R
@@ -7,10 +7,8 @@
 #' log-likelihood as the loss. \eqn{J(B) = f(\sigma(B))}, where \eqn{f} is a
 #' function of the singular values of \eqn{B}.
 #'
-#' The default parameterization is a nuclear norm penalized least squares regression.
-#'
-#' The least squares loss combined with \eqn{f(s) = \lambda \sum_i |s_i|}
-#' corresponds to the nuclear norm regularization problem.
+#' Currently, only the least squares problem with nuclear norm penalty is
+#' implemented.
 #'
 #' In case of \code{lambda = Inf} the maximum penalty \eqn{\lambda} is computed.
 #' In this case the return value is only estimate as a single value.
@@ -24,39 +22,39 @@
 #' For regression with intercept set \code{Z = rep(1, n)})
 #' @param y univariate response vector
 #' @param lambda penalty term, if set to \code{Inf} max lambda is computed.
-#' @param loss loss function, part of the objective function
-#' @param penalty penalty function with a vector of the singular values if the
-#'      current iterate as arguments. The default function
-#'      \code{function(sigma) sum(sigma)} is the nuclear norm penalty.
-#' @param grad.loss gradient with respect to \eqn{B} of the loss function
-#'      (required, there is no support for numerical gradients)
+#' @param max.iter maximum number of gradient updates
+#' @param max.line.iter maximum number of line search iterations
 #' @param shape Shape of the matrix valued predictors. Required iff the
 #'      predictors \code{X} are provided in vectorized form, e.g. as a 2D matrix.
 #' @param step.size max. stepsize for gradient updates
-#' @param alpha iterative Nesterov momentum scaling values
 #' @param B0 initial value for optimization. Matrix of dimensions \eqn{p\times q}
-#' @param beta initial value of additional covatiates coefficient for \eqn{Z}
-#' @param max.iter maximum number of gadient updates
-#' @param max.line.iter maximum number of line search iterations
+#' @param beta0 initial value of additional covariates coefficient for \eqn{Z}
+#' @param alpha iterative Nesterov momentum scaling values
+#' @param eps precision for main loop break conditions
 #' @param logger logging callback invoced after every line search before break
 #'      condition checks. The expected function signature is of the form
-#'      \code{logger(iter, loss, penalty, grad, B, beta, step.size)}.
+#'      \code{function(iter, loss, penalty, B, beta, step.size)}.
 #'
 #' @export
-RMReg <- function(X, Z, y, lambda = 0,
-    loss = function(B, beta, X, Z, y) 0.5 * sum((y - Z %*% beta - X %*% c(B))^2),
-    penalty = function(sigma) sum(sigma),
-    grad.loss = function(B, beta, X, Z, y) crossprod(X %*% c(B) + Z %*% beta - y, X),
-    shape = dim(X)[-1],
-    step.size = 1e-3,
-    alpha = function(a, t) { (1 + sqrt(1 + (2 * a)^2)) / 2 },
+RMReg <- function(X, Z, y, lambda = 0, max.iter = 500L, max.line.iter = 50L,
+    shape = dim(X)[-1], step.size = 1e-3,
     B0 = array(0, dim = shape),
-    beta = rep(0, NCOL(Z)),
-    max.iter = 500,
-    max.line.iter = ceiling(log(step.size / sqrt(.Machine$double.eps), 2)),
+    beta0 = rep(0, NCOL(Z)),
+    alpha = function(a, t) { (1 + sqrt(1 + (2 * a)^2)) / 2 },
+    eps = .Machine$double.eps,
     logger = NULL
 ) {
-    ### Check (prepair) params
+    # Define loss (without penalty)
+    loss <- function(B, beta, X, Z, y) 0.5 * sum((y - Z %*% beta - X %*% c(B))^2)
+    # gradient of loss (without penalty)
+    grad <- function(B, beta, X, Z, y) {
+        inner <- X %*% c(B) + Z %*% beta - y
+        list(beta = c(crossprod(inner, Z)), B = c(crossprod(inner, X)))
+    }
+    # # and the penalty function (as function of singular values)
+    # penalty <- function(sigma) sum(sigma)
+
+    # Check (prepare) params
     stopifnot(nrow(X) == length(y))
     if (!missing(shape)) {
         stopifnot(ncol(X) == prod(shape))
@@ -66,130 +64,125 @@ RMReg <- function(X, Z, y, lambda = 0,
     }
     if (missing(Z) || is.null(Z)) {
         Z <- matrix(0, nrow(X), 1)
-        ZZiZ <- NULL
-    } else {
-        if (!is.matrix(Z)) Z <- as.matrix(Z)
-        # Compute (Z' Z)^{-1} Z used to solve for beta. This is constant
-        # throughout and the variable name stands for "((Z' Z) Inverse) Z"
-        ZZiZ <- solve(crossprod(Z, Z), t(Z))
+    } else if (!is.matrix(Z)) {
+        Z <- as.matrix(Z)
     }
 
-    ### Set initial values
-    # Note: Naming convention; a name ending with 1 is the current iterate while
-    # names ending with 0 are the previous iterate value.
-    # Init singular values of B1 (require only current point, not previous B0)
+    # Set singular values of start matrix predictor coefficients
     if (missing(B0)) {
-        b1 <- rep(0, min(shape))
+        B1.sv <- rep(0, min(shape))
     } else {
-        b1 <- La.svd(B0, 0, 0)$d
+        B1.sv <- La.svd(B0, 0, 0)$d
     }
 
-    # Init current to previous (start position)
+    # initialize current and previous coefficients (start position)
     B1 <- B0
-    a0 <- 0
-    a1 <- 1
-    loss1 <- loss(B1, beta, X, Z, y)
+    beta1 <- beta0
+    alpha0 <- 0
+    alpha1 <- 1
+    loss0 <- loss1 <- loss(B1, beta1, X, Z, y)
 
-    # Start without, the nesterov momentum is zero anyway
-    no.nesterov <- TRUE    # Set to FALSE after the first iteration
-    ### Repeat untill convergence
-    for (iter in 1:max.iter) {
-        # Extrapolation with Nesterov Momentum
+    # main descent loop
+    no.nesterov <- FALSE
+    for (iter in seq_len(max.iter)) {
         if (no.nesterov) {
+            # classic gradient step as fallback
            S <- B1
+            s <- beta1
         } else {
-            S <- B1 + ((a0 - 1) / a1) * (B1 - B0)
+            # momentum step (extrapolation using previous direction)
+            S <- B1 + ((alpha0 - 1) / alpha1) * (B1 - B0)
+            s <- beta1 + ((alpha0 - 1) / alpha1) * (beta1 - beta0)
         }
 
-        # Solve for beta at extrapolation point
-        if (!is.null(ZZiZ)) {
-            beta <- ZZiZ %*% (y - X %*% c(S))
-        }
+        # compute (nesterov) gradient
+        G <- grad(S, s, X, Z, y)
 
-        # Compute Nesterov Gradient of the Loss
-        grad <- array(grad.loss(S, beta, X, Z, y), dim = shape)
-
-        # Line Search (executed at least once)
-        for (delta in step.size * 0.5^seq(0, max.line.iter - 1)) {
-            # (potential) next step with delta as stepsize for gradient update
-            A <- S - delta * grad
+        # backtracking line search (executed at least once)
+        for (delta in step.size * 0.5^seq(0, max.line.iter - 1L)) {
+            # Gradient step with step size delta
+            A <- S - delta * G$B
+            beta.temp <- s - delta * G$beta
 
             if (lambda == Inf) {
-                # Application of Corollary 1 (only nuclear norm supported) to
-                # estimate maximum lambda. In this case (first time this line is
-                # hit when lambda set to Inf, then B is zero (ignore B0 param))
-                lambda.max <- max(La.svd(A, 0, 0)$d) / delta
-                return(lambda.max)
+                # Application of Corollary 1 for estimation of max lambda
+                # Return max lambda estimate
+                return(max(La.svd(A, 0, 0)$d) / delta)
             } else if (lambda > 0) {
                 # SVD of (potential) next step
                 svdA <- La.svd(A)
-                # Get (potential) next penalized iterate (nuclear norm version only)
-                b.temp <- pmax(0, svdA$d - delta * lambda)    # Singular values of B.temp
-                B.temp <- svdA$u %*% (b.temp * svdA$vt)
+                # Next (possible) penalized iterate
+                B.temp.sv <- pmax(0, svdA$d - delta * lambda)
+                B.temp <- svdA$u %*% (B.temp.sv * svdA$vt)
             } else {
-                # in case of no penalization (pure least squares solution)
-                b.temp <- La.svd(A, 0, 0)$d
+                # in case of no penalization (pure least squares)
+                B.temp.sv <- La.svd(A, 0, 0)$d
                 B.temp <- A
            }
 
-            # Solve for beta at (potential) next step
-            if (!is.null(ZZiZ)) {
-                beta <- ZZiZ %*% (y - X %*% c(B.temp))
-            }
-
-            # Check line search break condition
+            # Check line search condition
             #   h(B.temp) <= g(B.temp | S, delta)
             #   \_ left _/    \_____ right _____/
             # where g(B.temp | S, delta) is the first order approx. of the loss
             #   l(S) + <grad l(S), B - S> + | B - S |_F^2 / 2 delta + J(B)
-            left <- loss(B.temp, beta, X, Z, y) # + penalty(b.temp)
-            right <- loss(S, beta, X, Z, y) + sum(grad * (B1 - S)) +
-                norm(B1 - S, 'F')^2 / (2 * delta) # + penalty(b.temp)
+            left <- loss(B.temp, beta.temp, X, Z, y)
+            right <- loss(S, s, X, Z, y) +
+                sum(G$B * (B.temp - S)) + sum(G$beta * (beta.temp - s)) +
+                (norm(B.temp - S, 'F')^2 + sum((beta.temp - s)^2)) / (2 * delta)
 
             if (left <= right) { break }
        }
 
-        # Evaluate loss at (potential) new parameters
-        loss.temp <- loss(B.temp, beta, X, Z, y)
+        # Evaluate loss to ensure descent after line search
+        loss.temp <- loss(B.temp, beta.temp, X, Z, y)
 
         # logging callback
         if (is.function(logger)) {
-            logger(iter, loss.temp, penalty(b.temp), grad, B1, beta, delta)
+            logger(iter, loss.temp, lambda * sum(B.temp.sv),
+                B.temp, beta.temp, delta)
        }
 
-        # After gradient update enforce descent (stop if not decreasing)
-        if (loss.temp + penalty(b.temp) <= loss1 + penalty(b1)) {
-            no.nesterov <- FALSE
-            loss1 <- loss.temp
+        # after line search enforce descent
+        if (loss.temp + lambda * sum(B.temp.sv) <= loss1 + lambda * sum(B1.sv)) {
             B0 <- B1
-            B1 <- B.temp
-            b1 <- b.temp
+            B1 <- array(B.temp, shape)
+            B1.sv <- B.temp.sv
+            beta0 <- beta1
+            beta1 <- beta.temp
+            loss0 <- loss1
+            loss1 <- loss.temp
+            no.nesterov <- FALSE    # always reset
         } else if (!no.nesterov) {
-            # Retry without Nesterov extrapolation
-            no.nesterov <- TRUE
+            no.nesterov <- TRUE     # retry without momentum next
         } else {
-            break
+            break                   # failed even without momentum -> stop
        }
 
-        # If estimate is zero, stop algorithm
-        if (all(b.temp < .Machine$double.eps)) {
-            loss1 <- loss.temp
-            B1 <- array(0, dim = shape)
-            b1 <- rep(0, min(shape))
-            break
+        # check break conditions
+        if (sum(B1.sv) < eps) {
+            break   # estimate is (numerically) zero -> stop
+        }
+        if ((sum(G$B^2) + sum(G$beta^2)) < eps * sum(unlist(Map(length, G)))) {
+            break   # mean squared gradient is smaller than epsilon -> stop
+        }
+        if (abs(loss0 - loss1) < eps) {
+            break   # decrease is smaller than epsilon -> stop
        }
 
-        # Update momentum scaling
-        a0 <- a1
-        a1 <- alpha(a1, iter)
+        # update momentum scaling
+        alpha0 <- alpha1
+        alpha1 <- alpha(alpha1, iter)
+
+        # set step size to two times current delta
+        step.size <- 2 * delta
    }
 
-    ### Degrees of Freedom estimate (TODO: this is like in `matrix_sparsereg.m`)
+    # Degrees of Freedom estimate (TODO: this is like in `matrix_sparsereg.m`)
     sigma <- c(La.svd(A, 0, 0)$d, rep(0, max(shape) - min(shape)))
-    df <- if (!is.null(ZZiZ)) { ncol(Z) } else { 0 }
-    for (i in seq_len(sum(b1 > 0))) {
+    df <- length(beta1)
+    for (i in seq_len(sum(B1.sv > 0))) {
         df <- df + 1 + sigma[i] * (sigma[i] - delta * lambda) * (
             sum(ifelse((1:shape[1]) != i, 1 / (sigma[i]^2 - sigma[1:shape[1]]^2), 0)) +
             sum(ifelse((1:shape[2]) != i, 1 / (sigma[i]^2 - sigma[1:shape[2]]^2), 0)))
@@ -198,14 +191,14 @@ RMReg <- function(X, Z, y, lambda = 0,
     # return estimates and some additional stats
     list(
         B = B1,
-        beta = if(is.null(ZZiZ)) { NULL } else { beta },
-        singular.values = b1,
+        beta = beta1,
+        singular.values = B1.sv,
         iter = iter,
         df = df,
         loss = loss1,
         lambda = lambda,
-        AIC = loss1 / var(y) + 2 * df,
-        BIC = loss1 / var(y) + log(nrow(X)) * df,
+        AIC = loss1 + 2 * df,               # TODO: check this!
+        BIC = loss1 + log(nrow(X)) * df,    # TODO: check this!
         call = match.call()     # invocing function call, collects params like lambda
     )
 }
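
Editorial note, not part of the patch: a minimal usage sketch of the reworked
RMReg() interface introduced above, assuming the patched tensorPredictors
package is installed. The simulated data, the dimensions and the chosen
penalty fraction are illustrative only.

    library(tensorPredictors)

    set.seed(42)
    n <- 200; p <- 5; q <- 7
    B.true <- tcrossprod(rnorm(p), rnorm(q))    # rank 1 true coefficient matrix
    X <- matrix(rnorm(n * p * q), n, p * q)     # each row is vec of a p x q predictor
    Z <- rep(1, n)                              # intercept as additional covariate
    y <- c(X %*% c(B.true)) + rnorm(n)          # univariate response

    # maximum penalty for this data set (lambda = Inf returns a single value)
    lambda.max <- RMReg(X, Z, y, lambda = Inf, shape = c(p, q))

    # nuclear norm penalized fit at a fraction of the maximum penalty
    fit <- RMReg(X, Z, y, lambda = 0.1 * lambda.max, shape = c(p, q))
    fit$singular.values     # soft-thresholded singular values of the estimate
    fit$df                  # degrees of freedom estimate used for AIC / BIC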