From 2ddbe0e90d856360ce035f742c61bf6266793aed Mon Sep 17 00:00:00 2001
From: daniel <daniel@kapla.at>
Date: Thu, 9 Dec 2021 13:21:38 +0100
Subject: [PATCH] wip: Regularized Matrix Regression

---
 tensorPredictors/NAMESPACE                    |  4 +
 tensorPredictors/R/RMReg.R                    | 75 +++++++++++++++----
 tensorPredictors/R/matpow.R                   |  2 +-
 .../R/{plot_matrix.R => matrixImage.R}        |  7 +-
 tensorPredictors/R/random.R                   |  2 +-
 5 files changed, 69 insertions(+), 21 deletions(-)
 rename tensorPredictors/R/{plot_matrix.R => matrixImage.R} (84%)

diff --git a/tensorPredictors/NAMESPACE b/tensorPredictors/NAMESPACE
index 4db235d..595827d 100644
--- a/tensorPredictors/NAMESPACE
+++ b/tensorPredictors/NAMESPACE
@@ -4,8 +4,12 @@ export(CISE)
 export(LSIR)
 export(PCA2d)
 export(POI)
+export(RMReg)
 export(approx.kronecker)
+export(dist.projection)
 export(dist.subspace)
+export(matpow)
+export(matrixImage)
 export(reduce)
 import(stats)
 useDynLib(tensorPredictors, .registration = TRUE)
diff --git a/tensorPredictors/R/RMReg.R b/tensorPredictors/R/RMReg.R
index 5b952e4..8b9b9bb 100644
--- a/tensorPredictors/R/RMReg.R
+++ b/tensorPredictors/R/RMReg.R
@@ -17,12 +17,14 @@
 #'  first dimension are the observations while the second and third are the
 #'  `image' dimensions. When the data is provided as a matix it's assumed to be
 #'  of shape \eqn{n\times p q} where each observation is the vectorid `image'.
+#' @param Z additional covariate vector (can be \code{NULL} if not required.
+#'  For regression with intercept set \code{Z = rep(1, n)})
 #' @param y univariate response vector
 #' @param lambda penalty term (note: range between 0 and max. signular value
 #'  of the least squares solution gives non-trivial results)
-#' @param loss loss function part of the objective function
-#' @param grad.loss gradient of the loss function evaluated (required, there is
-#'  no support for numerical gradients)
+#' @param loss loss function, part of the objective function
+#' @param grad.loss gradient with respect to \eqn{B} of the loss function
+#'  (required, there is no support for numerical gradients)
 #' @param penalty penalty function with a vector of the singular values if the
 #'  current iterate as arguments. The default function
 #'  \code{function(sigma) sum(sigma)} is the nuclear norm penalty.
@@ -31,18 +33,20 @@
 #' @param step.size max. stepsize for gradient updates
 #' @param alpha iterative Nesterov momentum scaling values
 #' @param B0 initial value for optimization. Matrix of dimensions \eqn{p\times q}
+#' @param beta initial value of additional covatiates coefficient for \eqn{Z}
 #' @param max.iter maximum number of gadient updates
 #' @param max.line.iter maximum number of line search iterations
 #'
 #' @export
-RMReg <- function(X, y, lambda = 0,
-    loss = function(B, X, y) 0.5 * sum((y - X %*% c(B))^2),
-    grad.loss = function(B, X, y) crossprod(X %*% c(B) - y, X),
+RMReg <- function(X, Z, y, lambda = 0,
+    loss = function(B, beta, X, Z, y) 0.5 * sum((y - Z %*% beta - X %*% c(B))^2),
+    grad.loss = function(B, beta, X, Z, y) crossprod(X %*% c(B) + Z %*% beta - y, X),
     penalty = function(sigma) sum(sigma),
     shape = dim(X)[-1],
     step.size = 1e-3,
     alpha = function(a, t) { (1 + sqrt(1 + (2 * a)^2)) / 2 },
     B0 = array(0, dim = shape),
+    beta = rep(0, NCOL(Z)),
     max.iter = 500,
     max.line.iter = ceiling(log(step.size / sqrt(.Machine$double.eps), 2))
 ) {
@@ -54,7 +58,14 @@ RMReg <- function(X, y, lambda = 0,
         stopifnot(length(dim(X)) == 3)
         dim(X) <- c(nrow(X), prod(shape))
     }
-
+    if (missing(Z) || is.null(Z)) {
+        Z <- matrix(0, nrow(X), 1)
+        ZZiZ <- NULL
+    } else {
+        # Compute (Z' Z)^{-1} Z used to solve for beta. This is constant
+        # throughout and the variable name stands for "((Z' Z) Inverse) Z"
+        ZZiZ <- solve(crossprod(Z, Z), t(Z))
+    }
 
     ### Set initial values
     # singular values of B1 (require only current point, not previous B0)
@@ -66,15 +77,20 @@ RMReg <- function(X, y, lambda = 0,
     B1 <- B0
     a0 <- 0
     a1 <- 1
-    loss1 <- loss(B1, X, y)
+    loss1 <- loss(B1, beta, X, Z, y)
 
     ### Repeat untill convergence
-    for (iter in max.iter) {
+    for (iter in 1:max.iter) {
         # Extrapolation (Nesterov Momentum)
         S <- B1 + ((a0 - 1) / a1) * (B1 - B0)
 
+        # Solve for beta at extrapolation point
+        if (!is.null(ZZiZ)) {
+            beta <- ZZiZ %*% (y - X %*% c(S))
+        }
+
         # Compute Nesterov Gradient of the Loss
-        grad <- array(grad.loss(S, X, y), dim = shape)
+        grad <- array(grad.loss(S, beta, X, Z, y), dim = shape)
 
         # Line Search (executed at least once)
         for (delta in step.size * 0.5^seq(0, max.line.iter - 1)) {
@@ -94,34 +110,61 @@ RMReg <- function(X, y, lambda = 0,
                 B.temp <- A
             }
 
+            # Solve for beta at (potential) next step
+            if (!is.null(ZZiZ)) {
+                beta <- ZZiZ %*% (y - X %*% c(B.temp))
+            }
+
             # Check line search break condition
             #   h(B.temp) <= g(B.temp | S, delta)
             #  \_ left _/    \_____ right _____/
             # where g(B.temp | S, delta) is the first order approx. of the loss
             #   l(S) + <grad l(S), B - S> + | B - S |_F^2 / 2 delta + J(B)
-            left <- loss(B.temp, X, y) # + penalty(b.temp)
-            right <- loss(S, X, y) + sum(grad * (B1 - S)) +
+            left <- loss(B.temp, beta, X, Z, y) # + penalty(b.temp)
+            right <- loss(S, beta, X, Z, y) + sum(grad * (B1 - S)) +
                 norm(B1 - S, 'F')^2 / (2 * delta) # + penalty(b.temp)
             if (left <= right) {
                 break
             }
         }
 
-        # After gradient update enforce descent
-        loss.temp <- loss(B.temp, X, y)
+        # After gradient update enforce descent (stop if not decreasing)
+        loss.temp <- loss(B.temp, beta, X, Z, y)
         if (loss.temp + penalty(b.temp) <= loss1 + penalty(b1)) {
             loss1 <- loss.temp
             B0 <- B1
             B1 <- B.temp
+            b1 <- b.temp
         } else {
             break
         }
 
+        # Stop if estimate is zero
+        if (all(b1 < .Machine$double.eps)) {
+            break
+        }
+
         # Update momentum scaling
         a0 <- a1
         a1 <- alpha(a1, iter)
     }
 
-    # return estimate
-    B1
+    # ### Degrees of Freedom estimate
+    # df <- 
+
+    # return estimates and some additional stats
+    list(
+        B = B1,
+        beta = if(is.null(ZZiZ)) { NULL } else { beta },
+        singular.values = b1,
+        iter = iter,
+        df = "?",
+        AIC = "?",
+        BIC = "?",
+        call = match.call() # invocing function call, collects params like lambda
+    )
 }
+
+rmr.fit <- RMReg(X, NULL, y, lambda = 0)
+rmr.fit$iter
+rmr.fit$singular.values[rmr.fit$singular.values > 0]
diff --git a/tensorPredictors/R/matpow.R b/tensorPredictors/R/matpow.R
index bfd2907..d96aeaa 100644
--- a/tensorPredictors/R/matpow.R
+++ b/tensorPredictors/R/matpow.R
@@ -56,7 +56,7 @@
 #' all.equal(A %*% B, t(A %*% B))
 #' }
 #'
-#' @keywords internal
+#' @export
 matpow <- function(A, pow, tol = 1e-7) {
     if (nrow(A) != ncol(A)) {
         stop("Expected a square matix, but 'A' is ", nrow(A), " by ", ncol(A))
diff --git a/tensorPredictors/R/plot_matrix.R b/tensorPredictors/R/matrixImage.R
similarity index 84%
rename from tensorPredictors/R/plot_matrix.R
rename to tensorPredictors/R/matrixImage.R
index 46bf818..3cf4d96 100644
--- a/tensorPredictors/R/plot_matrix.R
+++ b/tensorPredictors/R/matrixImage.R
@@ -1,20 +1,21 @@
-#' Plots a matrix an image
+#' Plots a matrix as an image
 #'
 #' @param A a matrix to be plotted
 #' @param main overall title for the plot
+#' @param sub sub-title of the plot
 #' @param interpolate a logical vector (or scalar) indicating whether to apply
 #'  linear interpolation to the image when drawing.
 #' @param ... further arguments passed to \code{\link{rasterImage}}
 #'
 #' @export
-plot.matrix <- function(A, main = NULL, interpolate = FALSE, ...) {
+matrixImage <- function(A, main = NULL, sub = NULL, interpolate = FALSE, ...) {
 
     # Scale values of `A` to [0, 1] with min mapped to 1 and max to 0.
     A <- (max(A) - A) / diff(range(A))
 
     # plot raster image
     plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "red",
-        xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main)
+        xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main, sub = sub)
 
     # Add X-axis giving index
     ind <- seq(1, ncol(A), by = 1)
diff --git a/tensorPredictors/R/random.R b/tensorPredictors/R/random.R
index f57397f..8aa15f3 100644
--- a/tensorPredictors/R/random.R
+++ b/tensorPredictors/R/random.R
@@ -26,5 +26,5 @@ rmvnorm <- function(n = 1, mu = rep(0, p), sigma = diag(p)) {
     }
 
     # See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution
-    return(rep(mu, each = n) + matrix(rnorm(n * p), n) %*% chol(sigma))
+    rep(mu, each = n) + matrix(rnorm(n * p), n) %*% chol(sigma)
 }