wip: Regularized Matrix Regression
parent bb22e29d34
commit 2ddbe0e90d
@@ -4,8 +4,12 @@ export(CISE)
 export(LSIR)
 export(PCA2d)
 export(POI)
+export(RMReg)
 export(approx.kronecker)
+export(dist.projection)
 export(dist.subspace)
+export(matpow)
+export(matrixImage)
 export(reduce)
 import(stats)
 useDynLib(tensorPredictors, .registration = TRUE)

@@ -17,12 +17,14 @@
 #' first dimension are the observations while the second and third are the
 #' `image' dimensions. When the data is provided as a matrix it's assumed to be
 #' of shape \eqn{n\times p q} where each observation is the vectorized `image'.
+#' @param Z additional covariate vector (can be \code{NULL} if not required;
+#' for regression with intercept set \code{Z = rep(1, n)})
 #' @param y univariate response vector
 #' @param lambda penalty term (note: range between 0 and max. singular value
 #' of the least squares solution gives non-trivial results)
-#' @param loss loss function part of the objective function
-#' @param grad.loss gradient of the loss function evaluated (required, there is
-#' no support for numerical gradients)
+#' @param loss loss function, part of the objective function
+#' @param grad.loss gradient with respect to \eqn{B} of the loss function
+#' (required, there is no support for numerical gradients)
 #' @param penalty penalty function with a vector of the singular values of the
 #' current iterate as argument. The default function
 #' \code{function(sigma) sum(sigma)} is the nuclear norm penalty.
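
A short illustration of the two accepted shapes of X described above (the names
X.tensor and X.matrix are only illustrative, not part of the package):

    n <- 10; p <- 4; q <- 6
    X.tensor <- array(rnorm(n * p * q), dim = c(n, p, q))   # n observations of p x q `images'
    X.matrix <- matrix(X.tensor, nrow = n)                   # row i is the vectorized i-th image
    dim(X.matrix)                                            # 10 24, i.e. n x (p * q)
    all.equal(X.matrix[1, ], c(X.tensor[1, , ]))             # TRUE
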
@@ -31,18 +33,20 @@
 #' @param step.size max. stepsize for gradient updates
 #' @param alpha iterative Nesterov momentum scaling values
 #' @param B0 initial value for optimization. Matrix of dimensions \eqn{p\times q}
+#' @param beta initial value of the coefficient for the additional covariates \eqn{Z}
 #' @param max.iter maximum number of gradient updates
 #' @param max.line.iter maximum number of line search iterations
 #'
 #' @export
-RMReg <- function(X, y, lambda = 0,
-    loss = function(B, X, y) 0.5 * sum((y - X %*% c(B))^2),
-    grad.loss = function(B, X, y) crossprod(X %*% c(B) - y, X),
+RMReg <- function(X, Z, y, lambda = 0,
+    loss = function(B, beta, X, Z, y) 0.5 * sum((y - Z %*% beta - X %*% c(B))^2),
+    grad.loss = function(B, beta, X, Z, y) crossprod(X %*% c(B) + Z %*% beta - y, X),
     penalty = function(sigma) sum(sigma),
     shape = dim(X)[-1],
     step.size = 1e-3,
     alpha = function(a, t) { (1 + sqrt(1 + (2 * a)^2)) / 2 },
     B0 = array(0, dim = shape),
+    beta = rep(0, NCOL(Z)),
     max.iter = 500,
     max.line.iter = ceiling(log(step.size / sqrt(.Machine$double.eps), 2))
 ) {
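
Read together, the defaults above say the fitted criterion is squared error loss
plus lambda times the sum of singular values of B, i.e. nuclear norm penalized
least squares. A hedged sketch of that objective (this helper is not part of the
package; X is the n x (p*q) matrix form and Z a covariate matrix):

    objective <- function(B, beta, X, Z, y, lambda) {
        # squared error loss plus lambda times the nuclear norm of B
        0.5 * sum((y - Z %*% beta - X %*% c(B))^2) + lambda * sum(svd(B)$d)
    }
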
@@ -54,7 +58,14 @@ RMReg <- function(X, y, lambda = 0,
         stopifnot(length(dim(X)) == 3)
         dim(X) <- c(nrow(X), prod(shape))
     }
+    if (missing(Z) || is.null(Z)) {
+        Z <- matrix(0, nrow(X), 1)
+        ZZiZ <- NULL
+    } else {
+        # Compute (Z' Z)^{-1} Z used to solve for beta. This is constant
+        # throughout and the variable name stands for "((Z' Z) Inverse) Z"
+        ZZiZ <- solve(crossprod(Z, Z), t(Z))
+    }

     ### Set initial values
     # singular values of B1 (require only current point, not previous B0)
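
The precomputed ZZiZ is exactly the ingredient of the ordinary least squares fit
of the current residual on Z. A self-contained check (the names n, Z, r below are
illustrative, not from the package):

    set.seed(2)
    n <- 50
    Z <- cbind(1, rnorm(n))                       # intercept plus one covariate
    r <- rnorm(n)                                 # stand-in for y - X %*% c(B)
    ZZiZ <- solve(crossprod(Z, Z), t(Z))          # "((Z' Z) Inverse) Z", as in the code
    all.equal(c(ZZiZ %*% r),                      # closed-form beta update ...
              unname(lm.fit(Z, r)$coefficients))  # ... equals the least squares fit of r on Z
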
@@ -66,15 +77,20 @@ RMReg <- function(X, y, lambda = 0,
     B1 <- B0
     a0 <- 0
     a1 <- 1
-    loss1 <- loss(B1, X, y)
+    loss1 <- loss(B1, beta, X, Z, y)

     ### Repeat until convergence
-    for (iter in max.iter) {
+    for (iter in 1:max.iter) {
         # Extrapolation (Nesterov Momentum)
         S <- B1 + ((a0 - 1) / a1) * (B1 - B0)

+        # Solve for beta at extrapolation point
+        if (!is.null(ZZiZ)) {
+            beta <- ZZiZ %*% (y - X %*% c(S))
+        }
+
         # Compute Nesterov Gradient of the Loss
-        grad <- array(grad.loss(S, X, y), dim = shape)
+        grad <- array(grad.loss(S, beta, X, Z, y), dim = shape)

         # Line Search (executed at least once)
         for (delta in step.size * 0.5^seq(0, max.line.iter - 1)) {
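
The step that turns the gradient into the candidate B.temp falls between this hunk
and the next. For the default nuclear norm penalty the textbook choice is the
proximal map of delta * lambda * J at the gradient step, i.e. soft-thresholding of
the singular values; the sketch below shows that operation only and is not taken
from the package:

    prox.nuclear <- function(G, threshold) {
        # proximal operator of threshold * (nuclear norm) evaluated at G
        udv <- svd(G)
        d <- pmax(udv$d - threshold, 0)            # soft-threshold the singular values
        list(B = udv$u %*% (d * t(udv$v)), d = d)  # U diag(d) V' and the new spectrum
    }
    # e.g. something along the lines of: prox.nuclear(S - delta * grad, delta * lambda)
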
@@ -94,34 +110,61 @@ RMReg <- function(X, y, lambda = 0,
                 B.temp <- A
             }

+            # Solve for beta at (potential) next step
+            if (!is.null(ZZiZ)) {
+                beta <- ZZiZ %*% (y - X %*% c(B.temp))
+            }
+
             # Check line search break condition
             # h(B.temp) <= g(B.temp | S, delta)
             # \_ left _/    \_____ right _____/
             # where g(B.temp | S, delta) is the first order approx. of the loss
             # l(S) + <grad l(S), B - S> + | B - S |_F^2 / 2 delta + J(B)
-            left <- loss(B.temp, X, y) # + penalty(b.temp)
-            right <- loss(S, X, y) + sum(grad * (B1 - S)) +
+            left <- loss(B.temp, beta, X, Z, y) # + penalty(b.temp)
+            right <- loss(S, beta, X, Z, y) + sum(grad * (B1 - S)) +
                 norm(B1 - S, 'F')^2 / (2 * delta) # + penalty(b.temp)
             if (left <= right) {
                 break
             }
         }

-        # After gradient update enforce descent
-        loss.temp <- loss(B.temp, X, y)
+        # After gradient update enforce descent (stop if not decreasing)
+        loss.temp <- loss(B.temp, beta, X, Z, y)
         if (loss.temp + penalty(b.temp) <= loss1 + penalty(b1)) {
             loss1 <- loss.temp
             B0 <- B1
             B1 <- B.temp
+            b1 <- b.temp
         } else {
             break
         }

+        # Stop if estimate is zero
+        if (all(b1 < .Machine$double.eps)) {
+            break
+        }
+
         # Update momentum scaling
         a0 <- a1
         a1 <- alpha(a1, iter)
     }

-    # return estimate
-    B1
+    # ### Degrees of Freedom estimate
+    # df <-
+
+    # return estimates and some additional stats
+    list(
+        B = B1,
+        beta = if(is.null(ZZiZ)) { NULL } else { beta },
+        singular.values = b1,
+        iter = iter,
+        df = "?",
+        AIC = "?",
+        BIC = "?",
+        call = match.call() # invoking function call, collects params like lambda
+    )
 }

+rmr.fit <- RMReg(X, NULL, y, lambda = 0)
+rmr.fit$iter
+rmr.fit$singular.values[rmr.fit$singular.values > 0]
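
The three rmr.fit lines appended above refer to objects X and y that are not defined
anywhere in this diff (leftover work-in-progress checks). A self-contained variant on
simulated data could look as follows; every value here is illustrative only:

    set.seed(1)
    n <- 100; p <- 5; q <- 7
    X <- array(rnorm(n * p * q), dim = c(n, p, q))         # n observations of p x q images
    B.true <- tcrossprod(rnorm(p), rnorm(q))               # rank-1 true coefficient matrix
    y <- c(matrix(X, n) %*% c(B.true)) + rnorm(n)
    rmr.fit <- RMReg(X, NULL, y, lambda = 10)              # lambda chosen arbitrarily
    rmr.fit$iter
    rmr.fit$singular.values[rmr.fit$singular.values > 0]   # surviving singular values
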
@@ -56,7 +56,7 @@
 #' all.equal(A %*% B, t(A %*% B))
 #' }
 #'
-#' @keywords internal
+#' @export
 matpow <- function(A, pow, tol = 1e-7) {
     if (nrow(A) != ncol(A)) {
         stop("Expected a square matrix, but 'A' is ", nrow(A), " by ", ncol(A))
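
The example line all.equal(A %*% B, t(A %*% B)) above suggests matpow is used for
(generalized) matrix powers and inverses. A hypothetical call, under the assumption
that pow = -1 yields the inverse of a full-rank matrix:

    A <- crossprod(matrix(rnorm(25), 5, 5))   # symmetric positive definite 5 x 5
    B <- matpow(A, -1)                        # assumed meaning: matrix inverse
    all.equal(A %*% B, diag(5))               # should recover the identity
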
@@ -1,20 +1,21 @@
-#' Plots a matrix an image
+#' Plots a matrix as an image
 #'
 #' @param A a matrix to be plotted
 #' @param main overall title for the plot
+#' @param sub sub-title of the plot
 #' @param interpolate a logical vector (or scalar) indicating whether to apply
 #' linear interpolation to the image when drawing.
 #' @param ... further arguments passed to \code{\link{rasterImage}}
 #'
 #' @export
-plot.matrix <- function(A, main = NULL, interpolate = FALSE, ...) {
+matrixImage <- function(A, main = NULL, sub = NULL, interpolate = FALSE, ...) {

     # Scale values of `A` to [0, 1] with min mapped to 1 and max to 0.
     A <- (max(A) - A) / diff(range(A))

     # plot raster image
     plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "red",
-        xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main)
+        xlab = "", ylab = "", xaxt = "n", yaxt = "n", main = main, sub = sub)

     # Add X-axis giving index
     ind <- seq(1, ncol(A), by = 1)
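
A minimal illustrative call of the renamed function with the new sub argument (the
example matrix is made up):

    B <- outer(sin(seq(0, pi, length.out = 20)), cos(seq(0, pi, length.out = 30)))
    matrixImage(B, main = "Example coefficient matrix", sub = "20 x 30, rank 1")
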
@@ -26,5 +26,5 @@ rmvnorm <- function(n = 1, mu = rep(0, p), sigma = diag(p)) {
     }

     # See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution
-    return(rep(mu, each = n) + matrix(rnorm(n * p), n) %*% chol(sigma))
+    rep(mu, each = n) + matrix(rnorm(n * p), n) %*% chol(sigma)
 }
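
A quick sanity check of the sampler (parameter values are illustrative):

    p <- 3
    sigma <- 0.5^abs(outer(1:p, 1:p, "-"))    # AR(1)-style covariance
    samples <- rmvnorm(1e4, mu = 1:p, sigma = sigma)
    round(colMeans(samples), 1)               # close to 1 2 3
    round(cov(samples), 1)                    # close to sigma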