From 79f5e9781f81c57f908e7d84a49f4c20e28b453a Mon Sep 17 00:00:00 2001
From: daniel
Date: Tue, 18 Jan 2022 12:30:05 +0100
Subject: [PATCH] fix: RMReg

---
 tensorPredictors/R/RMReg.R | 205 ++++++++++++++++++-------------------
 1 file changed, 99 insertions(+), 106 deletions(-)

diff --git a/tensorPredictors/R/RMReg.R b/tensorPredictors/R/RMReg.R
index 9c01454..35208f1 100644
--- a/tensorPredictors/R/RMReg.R
+++ b/tensorPredictors/R/RMReg.R
@@ -7,10 +7,8 @@
 #' log-likelihood as the loss. \eqn{J(B) = f(\sigma(B))}, where \eqn{f} is a
 #' function of the singular values of \eqn{B}.
 #'
-#' The default parameterization is a nuclear norm penalized least squares regression.
-#'
-#' The least squares loss combined with \eqn{f(s) = \lambda \sum_i |s_i|}
-#' corresponds to the nuclear norm regularization problem.
+#' Currently, only the least squares problem with nuclear norm penalty is
+#' implemented.
 #'
 #' In case of \code{lambda = Inf} the maximum penalty \eqn{\lambda} is computed.
 #' In this case the return value is only estimate as a single value.
@@ -24,39 +22,39 @@
 #' For regression with intercept set \code{Z = rep(1, n)})
 #' @param y univariate response vector
 #' @param lambda penalty term, if set to \code{Inf} max lambda is computed.
-#' @param loss loss function, part of the objective function
-#' @param penalty penalty function with a vector of the singular values if the
-#'      current iterate as arguments. The default function
-#'      \code{function(sigma) sum(sigma)} is the nuclear norm penalty.
-#' @param grad.loss gradient with respect to \eqn{B} of the loss function
-#'      (required, there is no support for numerical gradients)
+#' @param max.iter maximum number of gradient updates
+#' @param max.line.iter maximum number of line search iterations
 #' @param shape Shape of the matrix valued predictors. Required iff the
 #'      predictors \code{X} are provided in vectorized form, e.g. as a 2D matrix.
 #' @param step.size max. stepsize for gradient updates
-#' @param alpha iterative Nesterov momentum scaling values
 #' @param B0 initial value for optimization. Matrix of dimensions \eqn{p\times q}
-#' @param beta initial value of additional covatiates coefficient for \eqn{Z}
-#' @param max.iter maximum number of gadient updates
-#' @param max.line.iter maximum number of line search iterations
+#' @param beta0 initial value of additional covariates coefficient for \eqn{Z}
+#' @param alpha iterative Nesterov momentum scaling values
+#' @param eps precision for main loop break conditions
 #' @param logger logging callback invoced after every line search before break
 #'      condition checks. The expected function signature is of the form
-#'      \code{logger(iter, loss, penalty, grad, B, beta, step.size)}.
+#'      \code{function(iter, loss, penalty, B, beta, step.size)}.
 #'
 #' @export
-RMReg <- function(X, Z, y, lambda = 0,
-    loss = function(B, beta, X, Z, y) 0.5 * sum((y - Z %*% beta - X %*% c(B))^2),
-    penalty = function(sigma) sum(sigma),
-    grad.loss = function(B, beta, X, Z, y) crossprod(X %*% c(B) + Z %*% beta - y, X),
-    shape = dim(X)[-1],
-    step.size = 1e-3,
-    alpha = function(a, t) { (1 + sqrt(1 + (2 * a)^2)) / 2 },
+RMReg <- function(X, Z, y, lambda = 0, max.iter = 500L, max.line.iter = 50L,
+    shape = dim(X)[-1], step.size = 1e-3,
     B0 = array(0, dim = shape),
-    beta = rep(0, NCOL(Z)),
-    max.iter = 500,
-    max.line.iter = ceiling(log(step.size / sqrt(.Machine$double.eps), 2)),
+    beta0 = rep(0, NCOL(Z)),
+    alpha = function(a, t) { (1 + sqrt(1 + (2 * a)^2)) / 2 },
+    eps = .Machine$double.eps,
     logger = NULL
 ) {
-    ### Check (prepair) params
+    # Define loss (without penalty)
+    loss <- function(B, beta, X, Z, y) 0.5 * sum((y - Z %*% beta - X %*% c(B))^2)
+    # gradient of loss (without penalty)
+    grad <- function(B, beta, X, Z, y) {
+        inner <- X %*% c(B) + Z %*% beta - y
+        list(beta = c(crossprod(inner, Z)), B = c(crossprod(inner, X)))
+    }
+    # # and the penalty function (as function of singular values)
+    # penalty <- function(sigma) sum(sigma)
+
+    # Check (prepare) params
     stopifnot(nrow(X) == length(y))
     if (!missing(shape)) {
         stopifnot(ncol(X) == prod(shape))
@@ -66,130 +64,125 @@ RMReg <- function(X, Z, y, lambda = 0,
     }
     if (missing(Z) || is.null(Z)) {
         Z <- matrix(0, nrow(X), 1)
-        ZZiZ <- NULL
-    } else {
-        if (!is.matrix(Z)) Z <- as.matrix(Z)
-        # Compute (Z' Z)^{-1} Z used to solve for beta. This is constant
-        # throughout and the variable name stands for "((Z' Z) Inverse) Z"
-        ZZiZ <- solve(crossprod(Z, Z), t(Z))
+    } else if (!is.matrix(Z)) {
+        Z <- as.matrix(Z)
     }
 
-    ### Set initial values
-    # Note: Naming convention; a name ending with 1 is the current iterate while
-    # names ending with 0 are the previous iterate value.
-    # Init singular values of B1 (require only current point, not previous B0)
+    # Set singular values of start matrix predictor coefficients
     if (missing(B0)) {
-        b1 <- rep(0, min(shape))
+        B1.sv <- rep(0, min(shape))
     } else {
-        b1 <- La.svd(B0, 0, 0)$d
+        B1.sv <- La.svd(B0, 0, 0)$d
     }
 
-    # Init current to previous (start position)
+    # initialize current and previous coefficients (start position)
     B1 <- B0
-    a0 <- 0
-    a1 <- 1
-    loss1 <- loss(B1, beta, X, Z, y)
+    beta1 <- beta0
+    alpha0 <- 0
+    alpha1 <- 1
+    loss0 <- loss1 <- loss(B1, beta1, X, Z, y)
 
-    # Start without, the nesterov momentum is zero anyway
-    no.nesterov <- TRUE    # Set to FALSE after the first iteration
-    ### Repeat untill convergence
-    for (iter in 1:max.iter) {
-        # Extrapolation with Nesterov Momentum
+    # main descent loop
+    no.nesterov <- FALSE
+    for (iter in seq_len(max.iter)) {
         if (no.nesterov) {
+            # classic gradient step as fallback
            S <- B1
+            s <- beta1
         } else {
-            S <- B1 + ((a0 - 1) / a1) * (B1 - B0)
+            # momentum step (extrapolation using previous direction)
+            S <- B1 + ((alpha0 - 1) / alpha1) * (B1 - B0)
+            s <- beta1 + ((alpha0 - 1) / alpha1) * (beta1 - beta0)
         }
 
-        # Solve for beta at extrapolation point
-        if (!is.null(ZZiZ)) {
-            beta <- ZZiZ %*% (y - X %*% c(S))
-        }
+        # compute (nesterov) gradient
+        G <- grad(S, s, X, Z, y)
 
-        # Compute Nesterov Gradient of the Loss
-        grad <- array(grad.loss(S, beta, X, Z, y), dim = shape)
-
-        # Line Search (executed at least once)
-        for (delta in step.size * 0.5^seq(0, max.line.iter - 1)) {
-            # (potential) next step with delta as stepsize for gradient update
-            A <- S - delta * grad
+        # backtracking line search (executed at least once)
+        for (delta in step.size * 0.5^seq(0, max.line.iter - 1L)) {
+            # Gradient step with step size delta
+            A <- S - delta * G$B
+            beta.temp <- s - delta * G$beta
 
             if (lambda == Inf) {
-                # Application of Corollary 1 (only nuclear norm supported) to
-                # estimate maximum lambda. In this case (first time this line is
-                # hit when lambda set to Inf, then B is zero (ignore B0 param))
-                lambda.max <- max(La.svd(A, 0, 0)$d) / delta
-                return(lambda.max)
+                # Application of Corollary 1 for estimation of max lambda
+                # Return max lambda estimate
+                return(max(La.svd(A, 0, 0)$d) / delta)
             } else if (lambda > 0) {
                 # SVD of (potential) next step
                 svdA <- La.svd(A)
-                # Get (potential) next penalized iterate (nuclear norm version only)
-                b.temp <- pmax(0, svdA$d - delta * lambda)    # Singular values of B.temp
-                B.temp <- svdA$u %*% (b.temp * svdA$vt)
+                # Next (possible) penalized iterate
+                B.temp.sv <- pmax(0, svdA$d - delta * lambda)
+                B.temp <- svdA$u %*% (B.temp.sv * svdA$vt)
             } else {
-                # in case of no penalization (pure least squares solution)
-                b.temp <- La.svd(A, 0, 0)$d
+                # in case of no penalization (pure least squares)
+                B.temp.sv <- La.svd(A, 0, 0)$d
                 B.temp <- A
            }
 
-            # Solve for beta at (potential) next step
-            if (!is.null(ZZiZ)) {
-                beta <- ZZiZ %*% (y - X %*% c(B.temp))
-            }
-
-            # Check line search break condition
+            # Check line search condition
             #   h(B.temp) <= g(B.temp | S, delta)
             #   \_ left _/    \_____ right _____/
             # where g(B.temp | S, delta) is the first order approx. of the loss
             #   l(S) + <grad l(S), B - S> + | B - S |_F^2 / 2 delta + J(B)
-            left <- loss(B.temp, beta, X, Z, y) # + penalty(b.temp)
-            right <- loss(S, beta, X, Z, y) + sum(grad * (B1 - S)) +
-                norm(B1 - S, 'F')^2 / (2 * delta) # + penalty(b.temp)
+            left <- loss(B.temp, beta.temp, X, Z, y)
+            right <- loss(S, s, X, Z, y) +
+                sum(G$B * (B.temp - S)) + sum(G$beta * (beta.temp - s)) +
+                (norm(B.temp - S, 'F')^2 + sum((beta.temp - s)^2)) / (2 * delta)
 
             if (left <= right) { break }
        }
 
-        # Evaluate loss at (potential) new parameters
-        loss.temp <- loss(B.temp, beta, X, Z, y)
+        # Evaluate loss to ensure descent after line search
+        loss.temp <- loss(B.temp, beta.temp, X, Z, y)
 
         # logging callback
         if (is.function(logger)) {
-            logger(iter, loss.temp, penalty(b.temp), grad, B1, beta, delta)
+            logger(iter, loss.temp, lambda * sum(B.temp.sv),
+                B.temp, beta.temp, delta)
        }
 
-        # After gradient update enforce descent (stop if not decreasing)
-        if (loss.temp + penalty(b.temp) <= loss1 + penalty(b1)) {
-            no.nesterov <- FALSE
-            loss1 <- loss.temp
+        # after line search enforce descent
+        if (loss.temp + lambda * sum(B.temp.sv) <= loss1 + lambda * sum(B1.sv)) {
             B0 <- B1
-            B1 <- B.temp
-            b1 <- b.temp
+            B1 <- array(B.temp, shape)
+            B1.sv <- B.temp.sv
+            beta0 <- beta1
+            beta1 <- beta.temp
+            loss0 <- loss1
+            loss1 <- loss.temp
+            no.nesterov <- FALSE    # always reset
         } else if (!no.nesterov) {
-            # Retry without Nesterov extrapolation
-            no.nesterov <- TRUE
+            no.nesterov <- TRUE     # retry without momentum next
         } else {
-            break
+            break                   # failed even without momentum -> stop
        }
 
-        # If estimate is zero, stop algorithm
-        if (all(b.temp < .Machine$double.eps)) {
-            loss1 <- loss.temp
-            B1 <- array(0, dim = shape)
-            b1 <- rep(0, min(shape))
-            break
+        # check break conditions
+        if (sum(B1.sv) < eps) {
+            break   # estimate is (numerically) zero -> stop
+        }
+        if ((sum(G$B^2) + sum(G$beta^2)) < eps * sum(unlist(Map(length, G)))) {
+            break   # mean squared gradient is smaller than epsilon -> stop
+        }
+        if (abs(loss0 - loss1) < eps) {
+            break   # decrease is smaller than epsilon -> stop
        }
 
-        # Update momentum scaling
-        a0 <- a1
-        a1 <- alpha(a1, iter)
+        # update momentum scaling
+        alpha0 <- alpha1
+        alpha1 <- alpha(alpha1, iter)
+
+        # set step size to two times current delta
+        step.size <- 2 * delta
    }
 
-    ### Degrees of Freedom estimate (TODO: this is like in `matrix_sparsereg.m`)
+    # Degrees of Freedom estimate (TODO: this is like in `matrix_sparsereg.m`)
     sigma <- c(La.svd(A, 0, 0)$d, rep(0, max(shape) - min(shape)))
-    df <- if (!is.null(ZZiZ)) { ncol(Z) } else { 0 }
-    for (i in seq_len(sum(b1 > 0))) {
+    df <- length(beta1)
+    for (i in seq_len(sum(B1.sv > 0))) {
         df <- df + 1 + sigma[i] * (sigma[i] - delta * lambda) * (
             sum(ifelse((1:shape[1]) != i, 1 / (sigma[i]^2 - sigma[1:shape[1]]^2), 0)) +
             sum(ifelse((1:shape[2]) != i, 1 / (sigma[i]^2 - sigma[1:shape[2]]^2), 0)))
@@ -198,14 +191,14 @@ RMReg <- function(X, Z, y, lambda = 0,
     # return estimates and some additional stats
     list(
         B = B1,
-        beta = if(is.null(ZZiZ)) { NULL } else { beta },
-        singular.values = b1,
+        beta = beta1,
+        singular.values = B1.sv,
         iter = iter,
         df = df,
         loss = loss1,
         lambda = lambda,
-        AIC = loss1 / var(y) + 2 * df,
-        BIC = loss1 / var(y) + log(nrow(X)) * df,
+        AIC = loss1 + 2 * df,               # TODO: check this!
+        BIC = loss1 + log(nrow(X)) * df,    # TODO: check this!
         call = match.call()     # invocing function call, collects params like lambda
     )
 }
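
Editorial note, not part of the patch: a minimal usage sketch of the reworked
RMReg() interface introduced above, assuming the patched tensorPredictors
package is installed. The simulated data, the dimensions and the chosen
penalty fraction are illustrative only.

    library(tensorPredictors)

    set.seed(42)
    n <- 200; p <- 5; q <- 7
    B.true <- tcrossprod(rnorm(p), rnorm(q))    # rank 1 true coefficient matrix
    X <- matrix(rnorm(n * p * q), n, p * q)     # each row is vec of a p x q predictor
    Z <- rep(1, n)                              # intercept as additional covariate
    y <- c(X %*% c(B.true)) + rnorm(n)          # univariate response

    # maximum penalty for this data set (lambda = Inf returns a single value)
    lambda.max <- RMReg(X, Z, y, lambda = Inf, shape = c(p, q))

    # nuclear norm penalized fit at a fraction of the maximum penalty
    fit <- RMReg(X, Z, y, lambda = 0.1 * lambda.max, shape = c(p, q))
    fit$singular.values     # soft-thresholded singular values of the estimate
    fit$df                  # degrees of freedom estimate used for AIC / BIC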