From 2ddbe0e90d856360ce035f742c61bf6266793aed Mon Sep 17 00:00:00 2001 From: daniel Date: Thu, 9 Dec 2021 13:21:38 +0100 Subject: [PATCH] wip: Regularized Matrix Regression --- tensorPredictors/NAMESPACE | 4 + tensorPredictors/R/RMReg.R | 75 +++++++++++++++---- tensorPredictors/R/matpow.R | 2 +- .../R/{plot_matrix.R => matrixImage.R} | 7 +- tensorPredictors/R/random.R | 2 +- 5 files changed, 69 insertions(+), 21 deletions(-) rename tensorPredictors/R/{plot_matrix.R => matrixImage.R} (84%) diff --git a/tensorPredictors/NAMESPACE b/tensorPredictors/NAMESPACE index 4db235d..595827d 100644 --- a/tensorPredictors/NAMESPACE +++ b/tensorPredictors/NAMESPACE @@ -4,8 +4,12 @@ export(CISE) export(LSIR) export(PCA2d) export(POI) +export(RMReg) export(approx.kronecker) +export(dist.projection) export(dist.subspace) +export(matpow) +export(matrixImage) export(reduce) import(stats) useDynLib(tensorPredictors, .registration = TRUE) diff --git a/tensorPredictors/R/RMReg.R b/tensorPredictors/R/RMReg.R index 5b952e4..8b9b9bb 100644 --- a/tensorPredictors/R/RMReg.R +++ b/tensorPredictors/R/RMReg.R @@ -17,12 +17,14 @@ #' first dimension are the observations while the second and third are the #' `image' dimensions. When the data is provided as a matix it's assumed to be #' of shape \eqn{n\times p q} where each observation is the vectorid `image'. +#' @param Z additional covariate vector (can be \code{NULL} if not required. +#' For regression with intercept set \code{Z = rep(1, n)}) #' @param y univariate response vector #' @param lambda penalty term (note: range between 0 and max. signular value #' of the least squares solution gives non-trivial results) -#' @param loss loss function part of the objective function -#' @param grad.loss gradient of the loss function evaluated (required, there is -#' no support for numerical gradients) +#' @param loss loss function, part of the objective function +#' @param grad.loss gradient with respect to \eqn{B} of the loss function +#' (required, there is no support for numerical gradients) #' @param penalty penalty function with a vector of the singular values if the #' current iterate as arguments. The default function #' \code{function(sigma) sum(sigma)} is the nuclear norm penalty. @@ -31,18 +33,20 @@ #' @param step.size max. stepsize for gradient updates #' @param alpha iterative Nesterov momentum scaling values #' @param B0 initial value for optimization. Matrix of dimensions \eqn{p\times q} +#' @param beta initial value of additional covatiates coefficient for \eqn{Z} #' @param max.iter maximum number of gadient updates #' @param max.line.iter maximum number of line search iterations #' #' @export -RMReg <- function(X, y, lambda = 0, - loss = function(B, X, y) 0.5 * sum((y - X %*% c(B))^2), - grad.loss = function(B, X, y) crossprod(X %*% c(B) - y, X), +RMReg <- function(X, Z, y, lambda = 0, + loss = function(B, beta, X, Z, y) 0.5 * sum((y - Z %*% beta - X %*% c(B))^2), + grad.loss = function(B, beta, X, Z, y) crossprod(X %*% c(B) + Z %*% beta - y, X), penalty = function(sigma) sum(sigma), shape = dim(X)[-1], step.size = 1e-3, alpha = function(a, t) { (1 + sqrt(1 + (2 * a)^2)) / 2 }, B0 = array(0, dim = shape), + beta = rep(0, NCOL(Z)), max.iter = 500, max.line.iter = ceiling(log(step.size / sqrt(.Machine$double.eps), 2)) ) { @@ -54,7 +58,14 @@ RMReg <- function(X, y, lambda = 0, stopifnot(length(dim(X)) == 3) dim(X) <- c(nrow(X), prod(shape)) } - + if (missing(Z) || is.null(Z)) { + Z <- matrix(0, nrow(X), 1) + ZZiZ <- NULL + } else { + # Compute (Z' Z)^{-1} Z used to solve for beta. This is constant + # throughout and the variable name stands for "((Z' Z) Inverse) Z" + ZZiZ <- solve(crossprod(Z, Z), t(Z)) + } ### Set initial values # singular values of B1 (require only current point, not previous B0) @@ -66,15 +77,20 @@ RMReg <- function(X, y, lambda = 0, B1 <- B0 a0 <- 0 a1 <- 1 - loss1 <- loss(B1, X, y) + loss1 <- loss(B1, beta, X, Z, y) ### Repeat untill convergence - for (iter in max.iter) { + for (iter in 1:max.iter) { # Extrapolation (Nesterov Momentum) S <- B1 + ((a0 - 1) / a1) * (B1 - B0) + # Solve for beta at extrapolation point + if (!is.null(ZZiZ)) { + beta <- ZZiZ %*% (y - X %*% c(S)) + } + # Compute Nesterov Gradient of the Loss - grad <- array(grad.loss(S, X, y), dim = shape) + grad <- array(grad.loss(S, beta, X, Z, y), dim = shape) # Line Search (executed at least once) for (delta in step.size * 0.5^seq(0, max.line.iter - 1)) { @@ -94,34 +110,61 @@ RMReg <- function(X, y, lambda = 0, B.temp <- A } + # Solve for beta at (potential) next step + if (!is.null(ZZiZ)) { + beta <- ZZiZ %*% (y - X %*% c(B.temp)) + } + # Check line search break condition # h(B.temp) <= g(B.temp | S, delta) # \_ left _/ \_____ right _____/ # where g(B.temp | S, delta) is the first order approx. of the loss # l(S) + + | B - S |_F^2 / 2 delta + J(B) - left <- loss(B.temp, X, y) # + penalty(b.temp) - right <- loss(S, X, y) + sum(grad * (B1 - S)) + + left <- loss(B.temp, beta, X, Z, y) # + penalty(b.temp) + right <- loss(S, beta, X, Z, y) + sum(grad * (B1 - S)) + norm(B1 - S, 'F')^2 / (2 * delta) # + penalty(b.temp) if (left <= right) { break } } - # After gradient update enforce descent - loss.temp <- loss(B.temp, X, y) + # After gradient update enforce descent (stop if not decreasing) + loss.temp <- loss(B.temp, beta, X, Z, y) if (loss.temp + penalty(b.temp) <= loss1 + penalty(b1)) { loss1 <- loss.temp B0 <- B1 B1 <- B.temp + b1 <- b.temp } else { break } + # Stop if estimate is zero + if (all(b1 < .Machine$double.eps)) { + break + } + # Update momentum scaling a0 <- a1 a1 <- alpha(a1, iter) } - # return estimate - B1 + # ### Degrees of Freedom estimate + # df <- + + # return estimates and some additional stats + list( + B = B1, + beta = if(is.null(ZZiZ)) { NULL } else { beta }, + singular.values = b1, + iter = iter, + df = "?", + AIC = "?", + BIC = "?", + call = match.call() # invocing function call, collects params like lambda + ) } + +rmr.fit <- RMReg(X, NULL, y, lambda = 0) +rmr.fit$iter +rmr.fit$singular.values[rmr.fit$singular.values > 0] diff --git a/tensorPredictors/R/matpow.R b/tensorPredictors/R/matpow.R index bfd2907..d96aeaa 100644 --- a/tensorPredictors/R/matpow.R +++ b/tensorPredictors/R/matpow.R @@ -56,7 +56,7 @@ #' all.equal(A %*% B, t(A %*% B)) #' } #' -#' @keywords internal +#' @export matpow <- function(A, pow, tol = 1e-7) { if (nrow(A) != ncol(A)) { stop("Expected a square matix, but 'A' is ", nrow(A), " by ", ncol(A)) diff --git a/tensorPredictors/R/plot_matrix.R b/tensorPredictors/R/matrixImage.R similarity index 84% rename from tensorPredictors/R/plot_matrix.R rename to tensorPredictors/R/matrixImage.R index 46bf818..3cf4d96 100644 --- a/tensorPredictors/R/plot_matrix.R +++ b/tensorPredictors/R/matrixImage.R @@ -1,20 +1,21 @@ -#' Plots a matrix an image +#' Plots a matrix as an image #' #' @param A a matrix to be plotted #' @param main overall title for the plot +#' @param sub sub-title of the plot #' @param interpolate a logical vector (or scalar) indicating whether to apply #' linear interpolation to the image when drawing. #' @param ... further arguments passed to \code{\link{rasterImage}} #' #' @export -plot.matrix <- function(A, main = NULL, interpolate = FALSE, ...) { +matrixImage <- function(A, main = NULL, sub = NULL, interpolate = FALSE, ...) { # Scale values of `A` to [0, 1] with min mapped to 1 and max to 0. A <- (max(A) - A) / diff(range(A)) # plot raster image plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "red", - xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main) + xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main, sub = sub) # Add X-axis giving index ind <- seq(1, ncol(A), by = 1) diff --git a/tensorPredictors/R/random.R b/tensorPredictors/R/random.R index f57397f..8aa15f3 100644 --- a/tensorPredictors/R/random.R +++ b/tensorPredictors/R/random.R @@ -26,5 +26,5 @@ rmvnorm <- function(n = 1, mu = rep(0, p), sigma = diag(p)) { } # See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution - return(rep(mu, each = n) + matrix(rnorm(n * p), n) %*% chol(sigma)) + rep(mu, each = n) + matrix(rnorm(n * p), n) %*% chol(sigma) }