#' Regularized Matrix Regression
#'
#' Solved the regularized problem
#'      \deqn{min h(B) = l(B) + J(B)}
#' for a matrix \eqn{B}.
#' where \eqn{l} is a loss function; for the GLM, we use the negative
#' log-likelihood as the loss. \eqn{J(B) = f(\sigma(B))}, where \eqn{f} is a
#' function of the singular values of \eqn{B}.
#'
#' The default parameterization is a nuclear norm penalized least squares regression.
#'
#' The least squares loss combined with \eqn{f(s) = \lambda \sum_i |s_i|}
#' corresponds to the nuclear norm regularization problem.
#'
#' @param X the singnal data ether as a 3D tensor or a 2D matrix. In case of a
#'  3D tensor the axis are assumed to be \eqn{n\times p\times q} meaning the
#'  first dimension are the observations while the second and third are the
#'  `image' dimensions. When the data is provided as a matix it's assumed to be
#'  of shape \eqn{n\times p q} where each observation is the vectorid `image'.
#' @param y univariate response vector
#' @param lambda penalty term (note: range between 0 and max. signular value
#'  of the least squares solution gives non-trivial results)
#' @param loss loss function part of the objective function
#' @param grad.loss gradient of the loss function evaluated (required, there is
#'  no support for numerical gradients)
#' @param penalty penalty function with a vector of the singular values if the
#'  current iterate as arguments. The default function
#'  \code{function(sigma) sum(sigma)} is the nuclear norm penalty.
#' @param shape Shape of the matrix valued predictors. Required iff the
#'  predictors \code{X} are provided in vectorized form, e.g. as a 2D matrix.
#' @param step.size max. stepsize for gradient updates
#' @param alpha iterative Nesterov momentum scaling values
#' @param B0 initial value for optimization. Matrix of dimensions \eqn{p\times q}
#' @param max.iter maximum number of gadient updates
#' @param max.line.iter maximum number of line search iterations
#'
#' @export
RMReg <- function(X, y, lambda = 0,
    loss = function(B, X, y) 0.5 * sum((y - X %*% c(B))^2),
    grad.loss = function(B, X, y) crossprod(X %*% c(B) - y, X),
    penalty = function(sigma) sum(sigma),
    shape = dim(X)[-1],
    step.size = 1e-3,
    alpha = function(a, t) { (1 + sqrt(1 + (2 * a)^2)) / 2 },
    B0 = array(0, dim = shape),
    max.iter = 500,
    max.line.iter = ceiling(log(step.size / sqrt(.Machine$double.eps), 2))
) {
    ### Check (prepair) params
    stopifnot(nrow(X) == length(y))
    if (!missing(shape)) {
        stopifnot(ncol(X) == prod(shape))
    } else {
        stopifnot(length(dim(X)) == 3)
        dim(X) <- c(nrow(X), prod(shape))
    }


    ### Set initial values
    # singular values of B1 (require only current point, not previous B0)
    if (missing(B0)) {
        b1 <- rep(0, min(shape))
    } else {
        b1 <- La.svd(B0, 0, 0)$d
    }
    B1 <- B0
    a0 <- 0
    a1 <- 1
    loss1 <- loss(B1, X, y)

    ### Repeat untill convergence
    for (iter in max.iter) {
        # Extrapolation (Nesterov Momentum)
        S <- B1 + ((a0 - 1) / a1) * (B1 - B0)

        # Compute Nesterov Gradient of the Loss
        grad <- array(grad.loss(S, X, y), dim = shape)

        # Line Search (executed at least once)
        for (delta in step.size * 0.5^seq(0, max.line.iter - 1)) {
            # (potential) next step with delta as stepsize for gradient update
            A <- S - delta * grad

            if (lambda > 0) {
                # SVD of (potential) next step
                svdA <- La.svd(A)

                # Get (potential) next penalized iterate (nuclear norm version only)
                b.temp <- pmax(0, svdA$d - lambda)  # Singular values of B.temp
                B.temp <- svdA$u %*% (b.temp * svdA$vt)
            } else {
                # in case of no penalization (pure least squares solution)
                b.temp <- La.svd(A, 0, 0)$d
                B.temp <- A
            }

            # Check line search break condition
            #   h(B.temp) <= g(B.temp | S, delta)
            #  \_ left _/    \_____ right _____/
            # where g(B.temp | S, delta) is the first order approx. of the loss
            #   l(S) + <grad l(S), B - S> + | B - S |_F^2 / 2 delta + J(B)
            left <- loss(B.temp, X, y) # + penalty(b.temp)
            right <- loss(S, X, y) + sum(grad * (B1 - S)) +
                norm(B1 - S, 'F')^2 / (2 * delta) # + penalty(b.temp)
            if (left <= right) {
                break
            }
        }

        # After gradient update enforce descent
        loss.temp <- loss(B.temp, X, y)
        if (loss.temp + penalty(b.temp) <= loss1 + penalty(b1)) {
            loss1 <- loss.temp
            B0 <- B1
            B1 <- B.temp
        } else {
            break
        }

        # Update momentum scaling
        a0 <- a1
        a1 <- alpha(a1, iter)
    }

    # return estimate
    B1
}