add: Regularized Matrix Regression (RMReg)
parent 5f36ec21de
commit bb22e29d34

@@ -0,0 +1,127 @@
#' Regularized Matrix Regression
#'
#' Solves the regularized problem
#' \deqn{\min_B h(B) = l(B) + J(B)}
#' over a matrix \eqn{B}, where \eqn{l} is a loss function; for GLMs, we use
#' the negative log-likelihood as the loss. \eqn{J(B) = f(\sigma(B))}, where
#' \eqn{f} is a function of the singular values of \eqn{B}.
#'
#' The default parameterization is a nuclear norm penalized least squares
#' regression.
#'
#' The least squares loss combined with \eqn{f(s) = \lambda \sum_i |s_i|}
#' corresponds to the nuclear norm regularization problem.
#'
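#' For the nuclear norm penalty the update step has a closed form solution:
#' singular value soft-thresholding. Writing the SVD of the unpenalized
#' gradient step as \eqn{A = U diag(s) V'}, the penalized iterate is
#' \deqn{U diag(\max(0, s_i - \lambda)) V'}
#' (this is what the \code{lambda > 0} branch of the line search below
#' computes).
#'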
#' @param X the signal data, either as a 3D tensor or a 2D matrix. In case of
#'  a 3D tensor the axes are assumed to be \eqn{n\times p\times q}, meaning
#'  the first dimension indexes the observations while the second and third
#'  are the `image' dimensions. When the data is provided as a matrix it is
#'  assumed to be of shape \eqn{n\times pq} where each observation is the
#'  vectorized `image'.
#' @param y univariate response vector
#' @param lambda penalty term (note: values between 0 and the maximum
#'  singular value of the least squares solution give non-trivial results)
#' @param loss loss function part of the objective function
#' @param grad.loss gradient of the loss function (required, there is no
#'  support for numerical gradients)
#' @param penalty penalty function taking the vector of singular values of
#'  the current iterate as argument. The default,
#'  \code{function(sigma) sum(sigma)}, is the nuclear norm penalty.
#' @param shape shape of the matrix valued predictors. Required iff the
#'  predictors \code{X} are provided in vectorized form, e.g. as a 2D matrix.
#' @param step.size max. step size for gradient updates
#' @param alpha function computing the iterative Nesterov momentum scaling
#'  values
#' @param B0 initial value for the optimization, a matrix of dimensions
#'  \eqn{p\times q}
#' @param max.iter maximum number of gradient updates
#' @param max.line.iter maximum number of line search iterations
#'
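#' A minimal synthetic example (an illustrative sketch: the dimensions, the
#' noise level and the choice of \code{lambda} below are assumptions, not
#' recommended defaults):
#'
#' @examples
#' n <- 100; p <- 8; q <- 6
#' X <- array(rnorm(n * p * q), dim = c(n, p, q))
#' B.true <- tcrossprod(rnorm(p), rnorm(q)) # rank 1 true coefficient matrix
#' y <- apply(X, 1, function(Xi) sum(Xi * B.true)) + rnorm(n, sd = 0.1)
#' # non-trivial lambda range: (0, max singular value of the LS solution)
#' B.ls <- matrix(qr.solve(matrix(X, n), y), p, q)
#' lambda.max <- max(La.svd(B.ls, 0, 0)$d)
#' B.hat <- RMReg(X, y, lambda = lambda.max / 2)
#'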
#' @export
RMReg <- function(X, y, lambda = 0,
    loss = function(B, X, y) 0.5 * sum((y - X %*% c(B))^2),
    grad.loss = function(B, X, y) crossprod(X %*% c(B) - y, X),
    penalty = function(sigma) sum(sigma),
    shape = dim(X)[-1],
    step.size = 1e-3,
    alpha = function(a, t) { (1 + sqrt(1 + (2 * a)^2)) / 2 },
    B0 = array(0, dim = shape),
    max.iter = 500,
    max.line.iter = ceiling(log(step.size / sqrt(.Machine$double.eps), 2))
) {
    ### Check (prepare) params
    stopifnot(nrow(X) == length(y))
    if (!missing(shape)) {
        stopifnot(ncol(X) == prod(shape))
    } else {
        # 'shape' defaults to dim(X)[-1], evaluated here while X is still 3D
        stopifnot(length(dim(X)) == 3)
        dim(X) <- c(nrow(X), prod(shape))
    }

    ### Set initial values
    # singular values of B1 (only the current point is required here, not
    # the previous iterate B0)
    if (missing(B0)) {
        b1 <- rep(0, min(shape))
    } else {
        b1 <- La.svd(B0, 0, 0)$d
    }
    B1 <- B0
    a0 <- 0
    a1 <- 1
    loss1 <- loss(B1, X, y)

    ### Repeat until convergence
    for (iter in seq_len(max.iter)) {
        # Extrapolation (Nesterov Momentum)
        S <- B1 + ((a0 - 1) / a1) * (B1 - B0)

        # Compute Nesterov Gradient of the Loss
        grad <- array(grad.loss(S, X, y), dim = shape)

        # Line Search (executed at least once)
        for (delta in step.size * 0.5^seq(0, max.line.iter - 1)) {
            # (potential) next step with delta as step size for the gradient
            # update
            A <- S - delta * grad

            if (lambda > 0) {
                # SVD of (potential) next step
                svdA <- La.svd(A)

                # Get (potential) next penalized iterate (nuclear norm
                # version only)
                b.temp <- pmax(0, svdA$d - lambda) # singular values of B.temp
                B.temp <- svdA$u %*% (b.temp * svdA$vt)
            } else {
                # in case of no penalization (pure least squares solution)
                b.temp <- La.svd(A, 0, 0)$d
                B.temp <- A
            }

            # Check line search break condition
            #   h(B.temp) <= g(B.temp | S, delta)
            #   \_ left _/    \_____ right _____/
            # where g(B.temp | S, delta) is the first order approximation of
            # the objective at S:
            #   l(S) + <grad l(S), B - S> + |B - S|_F^2 / (2 delta) + J(B)
            # The penalty J appears on both sides and cancels, hence it is
            # commented out.
            left <- loss(B.temp, X, y) # + penalty(b.temp)
            right <- loss(S, X, y) + sum(grad * (B.temp - S)) +
                norm(B.temp - S, 'F')^2 / (2 * delta) # + penalty(b.temp)
            if (left <= right) {
                break
            }
        }

        # After the gradient update enforce descent of the objective
        loss.temp <- loss(B.temp, X, y)
        if (loss.temp + penalty(b.temp) <= loss1 + penalty(b1)) {
            loss1 <- loss.temp
            b1 <- b.temp # keep singular values in sync with the new iterate
            B0 <- B1
            B1 <- B.temp
        } else {
            break
        }

        # Update momentum scaling
        a0 <- a1
        a1 <- alpha(a1, iter)
    }

    # return estimate
    B1
}

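# The documentation above mentions GLMs with the negative log-likelihood as
# loss. A hypothetical plug-in for a binary (logistic) response, written
# against the loss(B, X, y) / grad.loss(B, X, y) interface of RMReg, might
# look as follows ('y.binary' is an assumed 0/1 response, not package data):
loss.logit <- function(B, X, y) {
    eta <- as.vector(X %*% c(B))
    sum(log(1 + exp(eta))) - sum(y * eta) # negative log-likelihood
}
grad.loss.logit <- function(B, X, y) {
    eta <- as.vector(X %*% c(B))
    crossprod(1 / (1 + exp(-eta)) - y, X) # (p(eta) - y)' X, a 1 x (p q) row
}
# fit <- RMReg(X, y.binary, lambda = 0.1,
#              loss = loss.logit, grad.loss = grad.loss.logit)
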
@@ -3,25 +3,41 @@
#' Defined as the sine of the maximum principal angle between the column
#' spaces of the matrices
#' \deqn{\max\{\sin\theta_i : i = 1, ..., \min(d_1, d_2)\}}
#'
#' In case of \eqn{rank(A) = rank(B)} this measure is equivalent to
#' \deqn{\|A A' - B B'\|_2}
#' where \eqn{\|.\|_2} is the spectral norm (maximum singular value).
#'
#' @param A,B matrices of size \eqn{p\times d_1} and \eqn{p\times d_2}.
#' @param is.ortho boolean indicating whether the columns of \code{A} and
#'  \code{B} are already orthonormal, in which case the QR steps are skipped.
#' @param tol tolerance for rank detection, passed to \code{qr}.
#'
#' @export
dist.projection <- function(A, B, is.ortho = FALSE,
    tol = sqrt(.Machine$double.eps)
) {
    if (!is.matrix(A)) A <- as.matrix(A)
    if (!is.matrix(B)) B <- as.matrix(B)

    if (!is.ortho) {
        qrA <- qr(A, tol)
        rankA <- qrA$rank
        A <- qr.Q(qrA)[, seq_len(qrA$rank), drop = FALSE]
        qrB <- qr(B, tol)
        rankB <- qrB$rank
        B <- qr.Q(qrB)[, seq_len(qrB$rank), drop = FALSE]
    } else {
        rankA <- ncol(A)
        rankB <- ncol(B)
    }

    if (rankA == 0 || rankB == 0) {
        # a rank 0 space matches only another rank 0 space (distance 0);
        # any other comparison gives the maximal distance 1
        return(as.double(rankA != rankB))
    } else if (rankA == 1 && rankB == 1) {
        sigma.min <- min(abs(sum(A * B)), 1)
    } else {
        sigma.min <- min(La.svd(crossprod(A, B), 0, 0)$d, 1)
    }
    if (sigma.min < 0.5) {
        sin(acos(sigma.min))
    } else {
        cos(asin(sigma.min))
    }
}
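
# Example: distance between two random 3-dimensional subspaces of R^10, with
# a numeric check of the spectral norm identity from the documentation above
# (an illustrative sketch, not part of the commit):
A <- qr.Q(qr(matrix(rnorm(30), 10, 3)))
B <- qr.Q(qr(matrix(rnorm(30), 10, 3)))
d1 <- dist.projection(A, B, is.ortho = TRUE)
d2 <- max(La.svd(tcrossprod(A) - tcrossprod(B), 0, 0)$d)
all.equal(d1, d2) # TRUE: equal ranks, so both formulations agree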