wip: reimplementation of KPIR
parent 79f5e9781f
commit fcef12540f

@@ -1,16 +1,28 @@
# Generated by roxygen2: do not edit by hand

export("%x_1%")
export("%x_2%")
export("%x_3%")
export("%x_4%")
export(CISE)
export(LSIR)
export(PCA2d)
export(POI)
export(RMReg)
export(approx.kronecker)
export(colKronecker)
export(dist.projection)
export(dist.subspace)
export(kpir.base)
export(kpir.kron)
export(kpir.momentum)
export(kpir.new)
export(mat)
export(matpow)
export(matrixImage)
export(reduce)
export(rowKronecker)
export(tensor_predictor)
export(ttm)
import(stats)
useDynLib(tensorPredictors, .registration = TRUE)

@@ -135,7 +135,7 @@ RMReg <- function(X, Z, y, lambda = 0, max.iter = 500L, max.line.iter = 50L,
        }

        # Evaluate loss to ensure descent after line search
-       loss.temp <- loss(B.temp, beta.temp, X, Z, y)
+       loss.temp <- left # loss(B.temp, beta.temp, X, Z, y) # already computed

        # logging callback
        if (is.function(logger)) {
@@ -196,7 +196,7 @@ RMReg <- function(X, Z, y, lambda = 0, max.iter = 500L, max.line.iter = 50L,
        iter = iter,
        df = df,
        loss = loss1,
-       lambda = lambda,
+       lambda = delta * lambda, #
        AIC = loss1 + 2 * df,               # TODO: check this!
        BIC = loss1 + log(nrow(X)) * df,    # TODO: check this!
        call = match.call() # invoking function call, collects params like lambda

@@ -0,0 +1,24 @@
#' Column-wise Kronecker product
#'
#' This is a special case of the "Khatri-Rao Product".
#'
#' @seealso \code{\link{rowKronecker}}
#'
#' @export
colKronecker <- function(A, B) {
    sapply(seq_len(ncol(A)), function(i) kronecker(A[, i], B[, i]))
}

#' Row-wise Kronecker product
#'
#' Also known as the "Face-splitting product". This is a special case of the
#' "Khatri-Rao Product".
#'
#' @seealso \code{\link{colKronecker}}
#'
#' @export
rowKronecker <- function(A, B) {
    C <- vapply(seq_len(ncol(A)), function(i) A[, i] * B, B)
    dim(C) <- c(nrow(A), ncol(A) * ncol(B))
    C
}
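
Both variants reduce to base `kronecker` applied per column / per row; a minimal sanity-check sketch (not part of the package sources):

# colKronecker: column i is kronecker(A[, i], B[, i]);
# rowKronecker: row i is kronecker(A[i, ], B[i, ]).
A <- matrix(rnorm(3 * 4), 3, 4)
B <- matrix(rnorm(3 * 4), 3, 4)
stopifnot(all.equal(colKronecker(A, B)[, 2], kronecker(A[, 2], B[, 2])))
stopifnot(all.equal(rowKronecker(A, B)[2, ], kronecker(A[2, ], B[2, ])))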
@@ -0,0 +1,102 @@
#' (Slightly altered) old implementation
#'
#' @export
kpir.base <- function(X, Fy, p, t, k = 1L, r = 1L, d1 = 1L, d2 = 1L,
    method = c("mle", "ls"),
    eps1 = 1e-10, eps2 = 1e-10, max.iter = 500L,
    logger = NULL
) {

    log.likelihood <- function(par, X, Fy, Delta.inv, da, db) {
        alpha <- matrix(par[1:prod(da)], da[1L])
        beta <- matrix(par[(prod(da) + 1):length(par)], db[1L])
        error <- X - tcrossprod(Fy, kronecker(alpha, beta))
        sum(error * (error %*% Delta.inv))
    }

    # Validate method using inexact matching.
    method <- match.arg(method)

    # Number of observations.
    n <- nrow(X)

    ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
    cpFy <- crossprod(Fy)
    if (n <= k * r || qr(cpFy)$rank < k * r) {
        # In case of an under-determined system replace the inverse in the
        # normal equation by the Moore-Penrose pseudo inverse
        B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
    } else {
        # Compute the OLS estimate by the normal equation
        B <- t(solve(cpFy, crossprod(Fy, X)))
    }

    # Estimate alpha, beta as the nearest Kronecker approximation.
    c(alpha, beta) %<-% approx.kronecker(B, c(t, r), c(p, k))

    if (method == "ls") {
        # Estimate Delta.
        B <- kronecker(alpha, beta)
        rank <- if (ncol(Fy) == 1) 1L else qr(Fy)$rank
        resid <- X - tcrossprod(Fy, B)
        Delta <- crossprod(resid) / (nrow(X) - rank)

    } else { # mle
        B <- kronecker(alpha, beta)

        # Compute residuals
        resid <- X - tcrossprod(Fy, B)

        # Estimate initial Delta.
        Delta <- crossprod(resid) / nrow(X)

        # call logger with the initial starting value
        if (is.function(logger)) {
            # Transformed residuals (using `matpow` as a robust inversion
            # algorithm, it falls back to the Moore-Penrose pseudo inverse
            # in case of a singular `Delta`)
            resid.trans <- resid %*% matpow(Delta, -1)
            loss <- 0.5 * (nrow(X) * log(det(Delta)) + sum(resid.trans * resid))
            logger(0L, loss, alpha, beta, Delta, NA)
        }

        for (iter in 1:max.iter) {
            # Optimize the log-likelihood for alpha, beta with fixed Delta.
            opt <- optim(c(alpha, beta), log.likelihood, gr = NULL,
                X, Fy, matpow(Delta, -1), c(t, r), c(p, k))
            # Store previous alpha, beta and Delta (for the break condition).
            Delta.last <- Delta
            B.last <- B
            # Extract optimized alpha, beta.
            alpha <- matrix(opt$par[1:(t * r)], t, r)
            beta <- matrix(opt$par[(t * r + 1):length(opt$par)], p, k)
            # Calc new Delta with the likelihood-optimized alpha, beta.
            B <- kronecker(alpha, beta)
            resid <- X - tcrossprod(Fy, B)
            Delta <- crossprod(resid) / nrow(X)

            # call logger before the break condition check
            if (is.function(logger)) {
                resid.trans <- resid %*% matpow(Delta, -1)
                loss <- 0.5 * (nrow(X) * log(det(Delta)) + sum(resid.trans * resid))
                logger(iter, loss, alpha, beta, Delta, NA)
            }

            # Check break condition 1.
            if (norm(Delta - Delta.last, 'F') < eps1 * norm(Delta, 'F')) {
                # Check break condition 2.
                if (norm(B - B.last, 'F') < eps2 * norm(B, 'F')) {
                    break
                }
            }
        }
    }

    # calc final loss
    resid.trans <- resid %*% matpow(Delta, -1)
    loss <- 0.5 * (nrow(X) * log(det(Delta)) + sum(resid.trans * resid))

    list(loss = loss, alpha = alpha, beta = beta, Delta = Delta)
}
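
A minimal usage sketch (assumed dimensions; `X` holds `n` vectorized `t x p` predictors, `Fy` the `k x r` response functionals, and `approx.kronecker`, `matpow` and `%<-%` come from this package):

n <- 200; p <- 5; t <- 7; k <- 1; r <- 1
alpha <- matrix(rnorm(t * r), t, r)
beta  <- matrix(rnorm(p * k), p, k)
Fy <- matrix(rnorm(n * k * r), n)
X  <- tcrossprod(Fy, kronecker(alpha, beta)) + matrix(rnorm(n * t * p), n)
fit <- kpir.base(X, Fy, p, t, k, r, method = "ls")
str(fit)    # list(loss, alpha, beta, Delta)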
@@ -0,0 +1,211 @@
#' Gradient Descent Based Tensor Predictors method with Nesterov Accelerated
#' Momentum and a Kronecker structure assumption for the residual covariance
#' `Delta = Delta.1 %x% Delta.2` (simple plugin version!)
#'
#' @export
kpir.kron <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
    max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
    nesterov.scaling = function(a, t) { 0.5 * (1 + sqrt(1 + (2 * a)^2)) },
    eps = .Machine$double.eps,
    logger = NULL
) {

    # Check if X and Fy have the same number of observations
    stopifnot(nrow(X) == NROW(Fy))
    n <- nrow(X)    # Number of observations

    # Get and check predictor dimensions (convert to 3-tensor if needed)
    if (length(dim(X)) == 2L) {
        stopifnot(!missing(shape))
        stopifnot(ncol(X) == prod(shape[1:2]))
        p <- as.integer(shape[1])   # Predictor "height"
        q <- as.integer(shape[2])   # Predictor "width"
    } else if (length(dim(X)) == 3L) {
        p <- dim(X)[2]
        q <- dim(X)[3]
    } else {
        stop("'X' must be a matrix or 3-tensor")
    }

    # Get and check response dimensions (and convert to 3-tensor if needed)
    if (!is.array(Fy)) {
        Fy <- as.array(Fy)
    }
    if (length(dim(Fy)) == 1L) {
        k <- r <- 1L
        dim(Fy) <- c(n, 1L, 1L)
    } else if (length(dim(Fy)) == 2L) {
        stopifnot(!missing(shape))
        stopifnot(ncol(Fy) == prod(shape[3:4]))
        k <- as.integer(shape[3])   # Response functional "height"
        r <- as.integer(shape[4])   # Response functional "width"
    } else if (length(dim(Fy)) == 3L) {
        k <- dim(Fy)[2]
        r <- dim(Fy)[3]
    } else {
        stop("'Fy' must be a vector, matrix or 3-tensor")
    }


    ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
    # Vectorize
    dim(Fy) <- c(n, k * r)
    dim(X) <- c(n, p * q)
    # Solve
    cpFy <- crossprod(Fy)   # TODO: Check/Test and/or replace
    if (n <= k * r || qr(cpFy)$rank < k * r) {
        # In case of an under-determined system replace the inverse in the
        # normal equation by the Moore-Penrose pseudo inverse
        B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
    } else {
        # Compute the OLS estimate by the normal equation
        B <- t(solve(cpFy, crossprod(Fy, X)))
    }

    # De-vectorize (from now on tensor arithmetic)
    dim(Fy) <- c(n, k, r)
    dim(X) <- c(n, p, q)

    # Decompose `B = alpha x beta` into `alpha` and `beta`
    c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))

    # Compute residuals
    resid <- X - (Fy %x_3% alpha0 %x_2% beta0)

    # Covariance estimate
    Delta.1 <- tcrossprod(mat(resid, 3))
    Delta.2 <- tcrossprod(mat(resid, 2))
    tr <- sum(diag(Delta.1))
    Delta.1 <- Delta.1 / sqrt(n * tr)
    Delta.2 <- Delta.2 / sqrt(n * tr)

    # Transformed Residuals
    resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)

    # Evaluate negative log-likelihood
    loss <- 0.5 * (n * (p * log(det(Delta.1)) + q * log(det(Delta.2))) +
        sum(resid.trans * resid))

    # Call history callback (logger) before the first iteration
    if (is.function(logger)) {
        logger(0L, loss, alpha0, beta0, Delta.1, Delta.2, NA)
    }


    ### Step 2: MLE with LS solution as starting value
    a0 <- 0
    a1 <- 1
    alpha1 <- alpha0
    beta1 <- beta0

    # main descent loop
    no.nesterov <- TRUE
    break.reason <- NA
    for (iter in seq_len(max.iter)) {
        if (no.nesterov) {
            # without extrapolation as fallback
            S.alpha <- alpha1
            S.beta <- beta1
        } else {
            # extrapolation using the previous direction
            S.alpha <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0)
            S.beta <- beta1 + ((a0 - 1) / a1) * ( beta1 - beta0)
        }

        # Extrapolated Residuals
        resid <- X - (Fy %x_3% S.alpha %x_2% S.beta)

        # Covariance Estimates
        Delta.1 <- tcrossprod(mat(resid, 3))
        Delta.2 <- tcrossprod(mat(resid, 2))
        tr <- sum(diag(Delta.1))
        Delta.1 <- Delta.1 / sqrt(n * tr)
        Delta.2 <- Delta.2 / sqrt(n * tr)

        # Transform Residuals
        resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)

        # Calculate Gradients (at the extrapolated estimates `S.alpha`, `S.beta`)
        grad.alpha <- tcrossprod(mat(resid.trans, 3), mat(Fy %x_2% S.beta, 3))
        grad.beta <- tcrossprod(mat(resid.trans, 2), mat(Fy %x_3% S.alpha, 2))

        # Backtracking line search (Armijo type)
        # The `inner.prod` is used in the Armijo break condition but does not
        # depend on the step size.
        inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)

        # backtracking loop
        for (delta in step.size * 0.618034^seq.int(0L, length.out = max.line.iter)) {
            # Update `alpha` and `beta` (note: add(+), the gradients already
            # point in the descent direction of the loss because they are the
            # gradients of the log-likelihood [NOT the negative log-likelihood])
            alpha.temp <- S.alpha + delta * grad.alpha
            beta.temp <- S.beta + delta * grad.beta

            # Update Residuals, Covariance and transformed Residuals
            resid <- X - (Fy %x_3% alpha.temp %x_2% beta.temp)
            Delta.1 <- tcrossprod(mat(resid, 3))
            Delta.2 <- tcrossprod(mat(resid, 2))
            tr <- sum(diag(Delta.1))
            Delta.1 <- Delta.1 / sqrt(n * tr)
            Delta.2 <- Delta.2 / sqrt(n * tr)
            resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)

            # Evaluate negative log-likelihood
            loss.temp <- 0.5 * (n * (p * log(det(Delta.1)) + q * log(det(Delta.2)))
                + sum(resid.trans * resid))

            # Armijo line search break condition
            if (loss.temp <= loss - 0.1 * delta * inner.prod) {
                break
            }
        }

        # Call logger (invoke history callback)
        if (is.function(logger)) {
            logger(iter, loss.temp, alpha.temp, beta.temp, Delta.1, Delta.2, delta)
        }

        # Ensure descent
        if (loss.temp < loss) {
            alpha0 <- alpha1
            alpha1 <- alpha.temp
            beta0 <- beta1
            beta1 <- beta.temp

            # check break conditions (in the descent case)
            if (mean(abs(alpha1)) + mean(abs(beta1)) < eps) {
                break.reason <- "alpha, beta numerically zero"
                break # basically, estimates are zero -> stop
            }
            if (inner.prod < eps * (p * q + r * k)) {
                break.reason <- "mean squared gradient is smaller than epsilon"
                break
            }
            if (abs(loss.temp - loss) < eps) {
                break.reason <- "decrease is too small (slow)"
                break
            }

            loss <- loss.temp
            no.nesterov <- FALSE    # always reset
        } else if (!no.nesterov) {
            no.nesterov <- TRUE     # retry without momentum
            next
        } else {
            break.reason <- "failed even without momentum"
            break
        }

        # update momentum scaling
        a0 <- a1
        a1 <- nesterov.scaling(a1, iter)

        # Set the next iteration's starting step.size to the line-searched
        # step size (while allowing it to increase)
        step.size <- 1.618034 * delta

    }

    list(loss = loss, alpha = alpha1, beta = beta1,
        Delta.1 = Delta.1, Delta.2 = Delta.2, break.reason = break.reason)
}
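
The transformed residuals use the two mode products in place of multiplying the vectorized residuals with the inverse of the full Kronecker covariance; a small sketch of this identity (hypothetical dimensions, using `mat` and the `%x_i%` operators from this commit):

n <- 4; p <- 3; q <- 2
resid <- array(rnorm(n * p * q), c(n, p, q))
D1 <- crossprod(matrix(rnorm(q^2), q)) + diag(q)    # q x q SPD factor (mode 3)
D2 <- crossprod(matrix(rnorm(p^2), p)) + diag(p)    # p x p SPD factor (mode 2)
lhs <- mat(resid %x_3% solve(D1) %x_2% solve(D2), 1)
rhs <- mat(resid, 1) %*% solve(kronecker(D1, D2))
stopifnot(all.equal(lhs, rhs))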
@@ -0,0 +1,192 @@
#' Gradient Descent Based Tensor Predictors method with Nesterov Accelerated
#' Momentum
#'
#' @export
kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
    max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
    nesterov.scaling = function(a, t) { 0.5 * (1 + sqrt(1 + (2 * a)^2)) },
    eps = .Machine$double.eps,
    logger = NULL
) {

    # Check if X and Fy have the same number of observations
    stopifnot(nrow(X) == NROW(Fy))
    n <- nrow(X)    # Number of observations

    # Get and check predictor dimensions
    if (length(dim(X)) == 2L) {
        stopifnot(!missing(shape))
        stopifnot(ncol(X) == prod(shape[1:2]))
        p <- as.integer(shape[1])   # Predictor "height"
        q <- as.integer(shape[2])   # Predictor "width"
    } else if (length(dim(X)) == 3L) {
        p <- dim(X)[2]
        q <- dim(X)[3]
        dim(X) <- c(n, p * q)
    } else {
        stop("'X' must be a matrix or 3-tensor")
    }

    # Get and check response dimensions
    if (!is.array(Fy)) {
        Fy <- as.array(Fy)
    }
    if (length(dim(Fy)) == 1L) {
        k <- r <- 1L
        dim(Fy) <- c(n, 1L)
    } else if (length(dim(Fy)) == 2L) {
        stopifnot(!missing(shape))
        stopifnot(ncol(Fy) == prod(shape[3:4]))
        k <- as.integer(shape[3])   # Response functional "height"
        r <- as.integer(shape[4])   # Response functional "width"
    } else if (length(dim(Fy)) == 3L) {
        k <- dim(Fy)[2]
        r <- dim(Fy)[3]
        dim(Fy) <- c(n, k * r)
    } else {
        stop("'Fy' must be a vector, matrix or 3-tensor")
    }


    ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
    cpFy <- crossprod(Fy)   # TODO: Check/Test and/or replace
    if (n <= k * r || qr(cpFy)$rank < k * r) {
        # In case of an under-determined system replace the inverse in the
        # normal equation by the Moore-Penrose pseudo inverse
        B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
    } else {
        # Compute the OLS estimate by the normal equation
        B <- t(solve(cpFy, crossprod(Fy, X)))
    }

    # Decompose `B = alpha x beta` into `alpha` and `beta`
    c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))

    # Compute residuals
    resid <- X - tcrossprod(Fy, kronecker(alpha0, beta0))

    # Covariance estimate
    Delta <- crossprod(resid) / n

    # Transformed residuals (using `matpow` as a robust inversion algorithm,
    # it falls back to the Moore-Penrose pseudo inverse for singular `Delta`)
    resid.trans <- resid %*% matpow(Delta, -1)

    # Evaluate negative log-likelihood
    loss <- 0.5 * (n * log(det(Delta)) + sum(resid.trans * resid))

    # Call history callback (logger) before the first iteration
    if (is.function(logger)) {
        logger(0L, loss, alpha0, beta0, Delta, NA)
    }


    ### Step 2: MLE with LS solution as starting value
    a0 <- 0
    a1 <- 1
    alpha1 <- alpha0
    beta1 <- beta0

    # main descent loop
    no.nesterov <- TRUE
    for (iter in seq_len(max.iter)) {
        if (no.nesterov) {
            # without extrapolation as fallback
            S.alpha <- alpha1
            S.beta <- beta1
        } else {
            # extrapolation using the previous direction
            S.alpha <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0)
            S.beta <- beta1 + ((a0 - 1) / a1) * ( beta1 - beta0)
        }

        # Extrapolated Residuals, Covariance and transformed Residuals
        resid <- X - tcrossprod(Fy, kronecker(S.alpha, S.beta))
        Delta <- crossprod(resid) / n
        resid.trans <- resid %*% matpow(Delta, -1)

        # Sum the observation-wise Kronecker products (Face-Splitting Product)
        KR <- colSums(rowKronecker(Fy, resid.trans))
        dim(KR) <- c(p, q, k, r)

        # (Nesterov) `alpha` Gradient
        R.alpha <- aperm(KR, c(2, 4, 1, 3))
        dim(R.alpha) <- c(q * r, p * k)
        grad.alpha <- c(R.alpha %*% c(S.beta))

        # (Nesterov) `beta` Gradient
        R.beta <- aperm(KR, c(1, 3, 2, 4))
        dim(R.beta) <- c(p * k, q * r)
        grad.beta <- c(R.beta %*% c(S.alpha))

        # Backtracking line search (Armijo type)
        # The `inner.prod` is used in the Armijo break condition but does not
        # depend on the step size.
        inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)

        # backtracking loop
        for (delta in step.size * 0.618034^seq.int(0L, length.out = max.line.iter)) {
            # Update `alpha` and `beta` (note: add(+), the gradients already
            # point in the descent direction of the loss because they are the
            # gradients of the log-likelihood [NOT the negative log-likelihood])
            alpha.temp <- S.alpha + delta * grad.alpha
            beta.temp <- S.beta + delta * grad.beta

            # Update Residuals, Covariance and transformed Residuals
            resid <- X - tcrossprod(Fy, kronecker(alpha.temp, beta.temp))
            Delta <- crossprod(resid) / n
            resid.trans <- resid %*% matpow(Delta, -1)

            # Evaluate negative log-likelihood
            loss.temp <- 0.5 * (n * log(det(Delta)) + sum(resid.trans * resid))

            # Armijo line search break condition
            if (loss.temp <= loss - 0.1 * delta * inner.prod) {
                break
            }
        }

        # Call logger (invoke history callback)
        if (is.function(logger)) {
            logger(iter, loss.temp, alpha.temp, beta.temp, Delta, delta)
        }

        # Ensure descent
        if (loss.temp < loss) {
            alpha0 <- alpha1
            alpha1 <- alpha.temp
            beta0 <- beta1
            beta1 <- beta.temp

            # check break conditions (in the descent case)
            if (mean(abs(alpha1)) + mean(abs(beta1)) < eps) {
                break # basically, estimates are zero -> stop
            }
            if (inner.prod < eps * (p * q + r * k)) {
                break # mean squared gradient is smaller than epsilon -> stop
            }
            if (abs(loss.temp - loss) < eps) {
                break # decrease is too small (slow) -> stop
            }

            loss <- loss.temp
            no.nesterov <- FALSE    # always reset
        } else if (!no.nesterov) {
            no.nesterov <- TRUE     # retry without momentum
            next
        } else {
            break # failed even without momentum -> stop
        }

        # update momentum scaling
        a0 <- a1
        a1 <- nesterov.scaling(a1, iter)

        # Set the next iteration's starting step.size to the line-searched
        # step size (while allowing it to increase)
        step.size <- 1.618034 * delta

    }

    list(loss = loss, alpha = alpha1, beta = beta1, Delta = Delta)
}
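
The default `nesterov.scaling` yields the classical accelerated-gradient sequence; a quick look at the extrapolation weights `(a0 - 1) / a1` it produces:

a <- Reduce(function(a, iter) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
    1:10, 1, accumulate = TRUE)
round((head(a, -1) - 1) / a[-1], 2)
# 0.00 0.28 0.43 0.53 0.60 0.65 0.69 0.72 0.74 0.76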
@@ -0,0 +1,157 @@
#' Gradient Descent Based Tensor Predictors method
#'
#' @export
kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
    max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
    eps = .Machine$double.eps,
    logger = NULL
) {

    # Check if X and Fy have the same number of observations
    stopifnot(nrow(X) == NROW(Fy))
    n <- nrow(X)    # Number of observations

    # Get and check predictor dimensions
    if (length(dim(X)) == 2L) {
        stopifnot(!missing(shape))
        stopifnot(ncol(X) == prod(shape[1:2]))
        p <- as.integer(shape[1])   # Predictor "height"
        q <- as.integer(shape[2])   # Predictor "width"
    } else if (length(dim(X)) == 3L) {
        p <- dim(X)[2]
        q <- dim(X)[3]
        dim(X) <- c(n, p * q)
    } else {
        stop("'X' must be a matrix or 3-tensor")
    }

    # Get and check response dimensions
    if (!is.array(Fy)) {
        Fy <- as.array(Fy)
    }
    if (length(dim(Fy)) == 1L) {
        k <- r <- 1L
        dim(Fy) <- c(n, 1L)
    } else if (length(dim(Fy)) == 2L) {
        stopifnot(!missing(shape))
        stopifnot(ncol(Fy) == prod(shape[3:4]))
        k <- as.integer(shape[3])   # Response functional "height"
        r <- as.integer(shape[4])   # Response functional "width"
    } else if (length(dim(Fy)) == 3L) {
        k <- dim(Fy)[2]
        r <- dim(Fy)[3]
        dim(Fy) <- c(n, k * r)
    } else {
        stop("'Fy' must be a vector, matrix or 3-tensor")
    }

    ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
    cpFy <- crossprod(Fy)   # TODO: Check/Test and/or replace
    if (n <= k * r || qr(cpFy)$rank < k * r) {
        # In case of an under-determined system replace the inverse in the
        # normal equation by the Moore-Penrose pseudo inverse
        B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
    } else {
        # Compute the OLS estimate by the normal equation
        B <- t(solve(cpFy, crossprod(Fy, X)))
    }

    # Decompose `B = alpha x beta` into `alpha` and `beta`
    c(alpha, beta) %<-% approx.kronecker(B, c(q, r), c(p, k))

    # Compute residuals
    resid <- X - tcrossprod(Fy, kronecker(alpha, beta))

    # Covariance estimate
    Delta <- crossprod(resid) / n

    # Transformed residuals (using `matpow` as a robust inversion algorithm,
    # it falls back to the Moore-Penrose pseudo inverse for singular `Delta`)
    resid.trans <- resid %*% matpow(Delta, -1)

    # Evaluate negative log-likelihood
    loss <- 0.5 * (n * log(det(Delta)) + sum(resid.trans * resid))

    # Call history callback (logger) before the first iteration
    if (is.function(logger)) {
        logger(0L, loss, alpha, beta, Delta, NA)
    }


    ### Step 2: MLE with LS solution as starting value
    for (iter in seq_len(max.iter)) {
        # Sum the observation-wise Kronecker products (Face-Splitting Product)
        KR <- colSums(rowKronecker(Fy, resid.trans))
        dim(KR) <- c(p, q, k, r)

        # `alpha` Gradient
        R.Alpha <- aperm(KR, c(2, 4, 1, 3))
        dim(R.Alpha) <- c(q * r, p * k)
        grad.alpha <- c(R.Alpha %*% c(beta))

        # `beta` Gradient
        R.Beta <- aperm(KR, c(1, 3, 2, 4))
        dim(R.Beta) <- c(p * k, q * r)
        grad.beta <- c(R.Beta %*% c(alpha))

        # Line Search (Armijo type)
        # The `inner.prod` is used in the Armijo break condition but does not
        # depend on the step size.
        inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)

        # Line Search loop
        for (delta in step.size * 0.618034^seq.int(0L, length.out = max.line.iter)) {
            # Update `alpha` and `beta` (note: add(+), the gradients already
            # point in the descent direction of the loss because they are the
            # gradients of the log-likelihood [NOT the negative log-likelihood])
            alpha.temp <- alpha + delta * grad.alpha
            beta.temp <- beta + delta * grad.beta

            # Update Residuals, Covariance and transformed Residuals
            resid <- X - tcrossprod(Fy, kronecker(alpha.temp, beta.temp))
            Delta <- crossprod(resid) / n
            resid.trans <- resid %*% matpow(Delta, -1)

            # Evaluate negative log-likelihood
            loss.temp <- 0.5 * (n * log(det(Delta)) + sum(resid.trans * resid))

            # Armijo line search break condition
            if (loss.temp <= loss - 0.1 * delta * inner.prod) {
                break
            }
        }

        # Call logger (invoke history callback)
        if (is.function(logger)) {
            logger(iter, loss.temp, alpha.temp, beta.temp, Delta, delta)
        }

        # Ensure descent
        if (loss.temp < loss) {
            alpha <- alpha.temp
            beta <- beta.temp

            # check break conditions (in the descent case)
            if (mean(abs(alpha)) + mean(abs(beta)) < eps) {
                break # basically, estimates are zero -> stop
            }
            if (inner.prod < eps * (p * q + r * k)) {
                break # mean squared gradient is smaller than epsilon -> stop
            }
            if (abs(loss.temp - loss) < eps) {
                break # decrease is too small (slow) -> stop
            }

            loss <- loss.temp
        } else {
            break
        }

        # Set the next iteration's starting step.size to the line-searched
        # step size (while allowing it to increase)
        step.size <- 1.618034 * delta

    }

    list(loss = loss, alpha = alpha, beta = beta, Delta = Delta)
}
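
A minimal usage sketch with a logger callback matching the six arguments passed above (assumed dimensions; internally `alpha` is `q x r` and `beta` is `p x k`):

n <- 250; p <- 4; q <- 6; k <- 1; r <- 1
alpha <- matrix(rnorm(q * r), q, r)
beta  <- matrix(rnorm(p * k), p, k)
Fy <- matrix(rnorm(n * k * r), n)
X  <- tcrossprod(Fy, kronecker(alpha, beta)) + matrix(rnorm(n * p * q), n)
fit <- kpir.new(X, Fy, shape = c(p, q, k, r),
    logger = function(iter, loss, alpha, beta, Delta, delta) {
        cat(sprintf("%3d: loss = %12.4f\n", iter, loss))
    })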
@@ -0,0 +1,23 @@
#' Matricization
#'
#' @param T multi-dimensional array of order at least \code{mode}
#' @param mode dimension along which to matricize
#'
#' @returns matrix of dimensions \code{dim(T)[mode]} by \code{prod(dim(T)[-mode])}
#'
#' @export
mat <- function(T, mode) {
    mode <- as.integer(mode)

    dims <- dim(T)
    if (length(dims) < mode) {
        stop("mode must be a positive integer less than or equal to the tensor order")
    }

    if (mode > 1L) {
        T <- aperm(T, c(mode, seq_along(dims)[-mode]))
    }
    dim(T) <- c(dims[mode], prod(dims[-mode]))

    T
}
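
A short sketch of the unfolding dimensions:

T <- array(1:24, c(2, 3, 4))
dim(mat(T, 1))  # 2 12
dim(mat(T, 2))  # 3 8
stopifnot(identical(mat(T, 2)[, 1], T[1, , 1]))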
@@ -15,7 +15,8 @@
#'
#' @export
matrixImage <- function(A, add.values = FALSE,
-   main = NULL, sub = NULL, interpolate = FALSE, ...
+   main = NULL, sub = NULL, interpolate = FALSE, ...,
+   digits = getOption("digits")
) {
    # plot raster image
    plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "red",

@@ -38,8 +39,11 @@ matrixImage <- function(A, add.values = FALSE,

    # Writes matrix values (in colored element grids)
    if (any(add.values)) {
-       if (length(add.values) > 1) { A[!add.values] <- NA }
-       text(rep(x - 0.5, nrow(A)), rep(rev(y - 0.5), each = ncol(A)), A,
+       if (length(add.values) > 1) {
+           A[!add.values] <- NA
+           A[add.values] <- format(A[add.values], digits = digits)
+       }
+       text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A,
            adj = 0.5, col = as.integer(S > 0.65))
    }
}
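
A hedged usage sketch of the new `digits` argument; per the change above, `add.values` may also be a logical matrix selecting which entries to print:

A <- matrix(rnorm(35), 5, 7)
matrixImage(A, add.values = TRUE, digits = 3)
matrixImage(A, add.values = abs(A) > 0.5, digits = 2)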
@@ -1,4 +1,4 @@
-#' Multi-Value assigne operator.
+#' Multi-Value assign operator.
#'
#' @param lhs vector of variables (or variable names) to assign values.
#' @param rhs object that can be coerced to a list of the same length as
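
As used throughout this commit, the operator unpacks list-valued results into multiple variables, e.g.:

c(alpha, beta) %<-% list(matrix(0, 2, 3), matrix(1, 4, 5))
dim(alpha)  # 2 3
dim(beta)   # 4 5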
@@ -0,0 +1,72 @@
#' Tensor Times Matrix (n-mode tensor matrix product)
#'
#' @param T array of order at least \code{mode}
#' @param M matrix, the right hand side of the mode product such that
#'  \code{ncol(M)} equals \code{dim(T)[mode]}
#' @param mode the mode of the product in the range \code{1:length(dim(T))}
#'
#' @returns multi-dimensional array of the same order as \code{T} with the
#'  \code{mode} dimension equal to \code{nrow(M)}
#'
#' @export
ttm <- function(T, M, mode = length(dim(T))) {
    mode <- as.integer(mode)
    dims <- dim(T)

    if (length(dims) < mode) {
        stop(sprintf("Mode (%d) must be less than or equal to the tensor order (%d)",
            mode, length(dims)))
    }
    if (dims[mode] != ncol(M)) {
        stop(sprintf("Dimension mismatch, mode %d has dim %d but ncol is %d.",
            mode, dims[mode], ncol(M)))
    }

    # Special case: if `mode` equals the tensor order, a more efficient
    # alternative is Z M' where Z is just the reshaped tensor; as in the
    # mode-1 case, no permutation of elements is required.
    if (mode == length(dims)) {
        # Convert tensor to matrix (similar to matricization)
        dim(T) <- c(prod(dims[-mode]), dims[mode])

        # Equivalent matrix product
        C <- tcrossprod(T, M)

        # Shape back to tensor
        dim(C) <- c(dims[-mode], nrow(M))

        C
    } else {
        # Matricize tensor T
        if (mode != 1L) {
            perm <- c(mode, seq_along(dims)[-mode])
            T <- aperm(T, perm)
        }
        dim(T) <- c(dims[mode], prod(dims[-mode]))

        # Perform the equivalent matrix multiplication
        C <- M %*% T

        # Reshape and rearrange the matricized result back into a tensor
        dim(C) <- c(nrow(M), dims[-mode])
        if (mode == 1L) {
            C
        } else {
            aperm(C, order(perm))
        }
    }
}

#' @rdname ttm
#' @export
`%x_1%` <- function(T, M) ttm(T, M, 1L)
#' @rdname ttm
#' @export
`%x_2%` <- function(T, M) ttm(T, M, 2L)
#' @rdname ttm
#' @export
`%x_3%` <- function(T, M) ttm(T, M, 3L)
#' @rdname ttm
#' @export
`%x_4%` <- function(T, M) ttm(T, M, 4L)
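
The defining property of the mode product, `mat(ttm(T, M, n), n) == M %*% mat(T, n)`, in a quick sketch:

T <- array(rnorm(24), c(2, 3, 4))
M <- matrix(rnorm(5 * 3), 5, 3)
C <- T %x_2% M
dim(C)  # 2 5 4
stopifnot(all.equal(mat(C, 2), M %*% mat(T, 2)))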
@@ -0,0 +1,78 @@
#' Kronecker decomposed Variance Matrix estimation.
#'
#' @description Computes the Kronecker decomposition factors of the variance
#'  matrix
#'  \deqn{Var(X) = tr(L) tr(R) (L \otimes R).}
#'
#' @param X numeric matrix or 3d array.
#' @param shape in case of \code{X} being a matrix, this specifies the
#'  predictor shape, otherwise ignored.
#' @param center boolean specifying if \code{X} is centered before computing
#'  the left/right second moments. Set to \code{FALSE} for already centered
#'  data.
#'
#' @returns List containing
#' \describe{
#'  \item{lhs}{Left Hand Side \eqn{L} of the Kronecker decomposed variance matrix}
#'  \item{rhs}{Right Hand Side \eqn{R} of the Kronecker decomposed variance matrix}
#'  \item{trace}{Scaling factor \eqn{tr(L) tr(R)} for the variance matrix}
#' }
#'
#' @examples
#' n <- 503L   # nr. of observations
#' p <- 32L    # first predictor dimension
#' q <- 27L    # second predictor dimension
#' lhs <- 0.5^abs(outer(seq_len(q), seq_len(q), `-`))  # Left Var components
#' rhs <- 0.5^abs(outer(seq_len(p), seq_len(p), `-`))  # Right Var components
#' X <- mvtnorm::rmvnorm(n, sigma = kronecker(lhs, rhs))  # Multivariate normal data
#'
#' # Estimate the Kronecker decomposed variance matrix
#' dim(X)  # c(n, p * q)
#' fit <- var.kronecker(X, shape = c(p, q))
#'
#' # equivalent
#' dim(X) <- c(n, p, q)
#' fit <- var.kronecker(X)
#'
#' # Compute the complete estimated variance matrix
#' varX.hat <- fit$trace^-1 * kronecker(fit$lhs, fit$rhs)
#'
#' # or its inverse
#' varXinv.hat <- fit$trace * kronecker(solve(fit$lhs), solve(fit$rhs))
#'
var.kronecker <- function(X, shape = dim(X)[-1], center = TRUE) {
    # Get and check predictor dimensions
    n <- nrow(X)
    if (length(dim(X)) == 2L) {
        stopifnot(ncol(X) == prod(shape[1:2]))
        p <- as.integer(shape[1])   # Predictor "height"
        q <- as.integer(shape[2])   # Predictor "width"
        dim(X) <- c(n, p, q)
    } else if (length(dim(X)) == 3L) {
        p <- dim(X)[2]
        q <- dim(X)[3]
    } else {
        stop("'X' must be a matrix or 3-tensor")
    }

    if (isTRUE(center)) {
        # Center X; X[, i, j] <- X[, i, j] - mean(X[, i, j])
        # (reshape to a matrix first since `scale` expects one)
        dim(X) <- c(n, p * q)
        X <- scale(X, scale = FALSE)
        dim(X) <- c(n, p, q)
    }

    # Calc the left/right sides of the Kronecker structured covariance
    # var(X) = var.lhs %x% var.rhs
    var.lhs <- .rowMeans(apply(X, 1, crossprod), q * q, n)
    dim(var.lhs) <- c(q, q)
    var.rhs <- .rowMeans(apply(X, 1, tcrossprod), p * p, n)
    dim(var.rhs) <- c(p, p)

    # Estimate the scaling factor tr(var(X)) = tr(var.lhs) tr(var.rhs)
    trace <- sum(X^2) / n

    list(lhs = var.lhs, rhs = var.rhs, trace = trace)
}
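
A numerical sketch of the moment identities behind the estimator (assuming the mvtnorm package is available): with `Var(vec(X_i)) = L %x% R` one gets `E[X_i' X_i] = tr(R) L` and `E[X_i X_i'] = tr(L) R`, so rescaling the Kronecker product of the two factor estimates by the trace recovers the full covariance.

n <- 10000L; p <- 3L; q <- 4L
L <- 0.5^abs(outer(1:q, 1:q, `-`))
R <- 0.5^abs(outer(1:p, 1:p, `-`))
X <- mvtnorm::rmvnorm(n, sigma = kronecker(L, R))
fit <- var.kronecker(X, shape = c(p, q))
# fit$lhs ~ tr(R) L, fit$rhs ~ tr(L) R, fit$trace ~ tr(L) tr(R)
max(abs(kronecker(fit$lhs, fit$rhs) / fit$trace - kronecker(L, R)))  # small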