diff --git a/tensorPredictors/NAMESPACE b/tensorPredictors/NAMESPACE
index 51f28dc..5cff5c2 100644
--- a/tensorPredictors/NAMESPACE
+++ b/tensorPredictors/NAMESPACE
@@ -1,16 +1,28 @@
 # Generated by roxygen2: do not edit by hand
 
+export("%x_1%")
+export("%x_2%")
+export("%x_3%")
+export("%x_4%")
 export(CISE)
 export(LSIR)
 export(PCA2d)
 export(POI)
 export(RMReg)
 export(approx.kronecker)
+export(colKronecker)
 export(dist.projection)
 export(dist.subspace)
+export(kpir.base)
+export(kpir.kron)
+export(kpir.momentum)
+export(kpir.new)
+export(mat)
 export(matpow)
 export(matrixImage)
 export(reduce)
+export(rowKronecker)
 export(tensor_predictor)
+export(ttm)
 import(stats)
 useDynLib(tensorPredictors, .registration = TRUE)
diff --git a/tensorPredictors/R/RMReg.R b/tensorPredictors/R/RMReg.R
index 35208f1..269e524 100644
--- a/tensorPredictors/R/RMReg.R
+++ b/tensorPredictors/R/RMReg.R
@@ -135,7 +135,7 @@ RMReg <- function(X, Z, y, lambda = 0, max.iter = 500L, max.line.iter = 50L,
         }
 
         # Evaluate loss to ensure descent after line search
-        loss.temp <- loss(B.temp, beta.temp, X, Z, y)
+        loss.temp <- left   # loss(B.temp, beta.temp, X, Z, y) already computed in the line search
 
         # logging callback
         if (is.function(logger)) {
@@ -196,7 +196,7 @@ RMReg <- function(X, Z, y, lambda = 0, max.iter = 500L, max.line.iter = 50L,
         iter = iter,
         df = df,
         loss = loss1,
-        lambda = lambda,
+        lambda = delta * lambda,
         # AIC = loss1 + 2 * df,                     # TODO: check this!
         BIC = loss1 + log(nrow(X)) * df,            # TODO: check this!
         call = match.call() # invoking function call, collects params like lambda
diff --git a/tensorPredictors/R/face_splitting_product.R b/tensorPredictors/R/face_splitting_product.R
new file mode 100644
index 0000000..7a8dd64
--- /dev/null
+++ b/tensorPredictors/R/face_splitting_product.R
@@ -0,0 +1,24 @@
+#' Column-wise Kronecker product
+#'
+#' This is a special case of the Khatri-Rao product. Requires \code{A} and
+#' \code{B} to have the same number of columns.
+#'
+#' @seealso \code{\link{rowKronecker}}
+#'
+#' @export
+colKronecker <- function(A, B) {
+    sapply(seq_len(ncol(A)), function(i) kronecker(A[, i], B[, i]))
+}
+
+#' Row-wise Kronecker product
+#'
+#' Also known as the "face-splitting product", a special case of the
+#' Khatri-Rao product. Requires \code{A} and \code{B} to have the same number
+#' of rows.
+#'
+#' @seealso \code{\link{colKronecker}}
+#'
+#' @export
+rowKronecker <- function(A, B) {
+    C <- vapply(seq_len(ncol(A)), function(i) A[, i] * B, B)
+    dim(C) <- c(nrow(A), ncol(A) * ncol(B))
+    C
+}
diff --git a/tensorPredictors/R/kpir_base.R b/tensorPredictors/R/kpir_base.R
new file mode 100644
index 0000000..3a3673b
--- /dev/null
+++ b/tensorPredictors/R/kpir_base.R
@@ -0,0 +1,102 @@
+#' (Slightly altered) old implementation
+#'
+#' @export
+kpir.base <- function(X, Fy, p, t, k = 1L, r = 1L, d1 = 1L, d2 = 1L,
+    method = c("mle", "ls"),
+    eps1 = 1e-10, eps2 = 1e-10, max.iter = 500L,
+    logger = NULL
+) {
+
+    log.likelihood <- function(par, X, Fy, Delta.inv, da, db) {
+        alpha <- matrix(par[1:prod(da)], da[1L])
+        beta <- matrix(par[(prod(da) + 1):length(par)], db[1L])
+        error <- X - tcrossprod(Fy, kronecker(alpha, beta))
+        sum(error * (error %*% Delta.inv))
+    }
+
+    # Validate method using inexact matching.
+    method <- match.arg(method)
+
+    # ## Step 1:
+    # # OLS estimate of the model `X = F_y B + epsilon`.
+    # B <- t(solve(crossprod(Fy), crossprod(Fy, X)))
+
+    ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
+    cpFy <- crossprod(Fy)
+    if (nrow(X) <= k * r || qr(cpFy)$rank < k * r) {
+        # In case of an under-determined system replace the inverse in the
+        # normal equation by the Moore-Penrose pseudo inverse
+        B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
+    } else {
+        # Compute the OLS estimate via the normal equation
+        B <- t(solve(cpFy, crossprod(Fy, X)))
+    }
+
+    # Estimate alpha, beta as the nearest Kronecker approximation.
+    c(alpha, beta) %<-% approx.kronecker(B, c(t, r), c(p, k))
+
+    if (method == "ls") {
+        # Estimate Delta.
+        B <- kronecker(alpha, beta)
+        rank <- if (ncol(Fy) == 1) 1L else qr(Fy)$rank
+        resid <- X - tcrossprod(Fy, B)
+        Delta <- crossprod(resid) / (nrow(X) - rank)
+
+    } else { # mle
+        B <- kronecker(alpha, beta)
+
+        # Compute residuals
+        resid <- X - tcrossprod(Fy, B)
+
+        # Estimate initial Delta.
+        Delta <- crossprod(resid) / nrow(X)
+
+        # call logger with initial starting value
+        if (is.function(logger)) {
+            # Transformed Residuals (using `matpow` as robust inversion algo,
+            # uses the Moore-Penrose pseudo inverse for singular `Delta`)
+            resid.trans <- resid %*% matpow(Delta, -1)
+            loss <- 0.5 * (nrow(X) * log(det(Delta)) + sum(resid.trans * resid))
+            logger(0L, loss, alpha, beta, Delta, NA)
+        }
+
+        for (iter in 1:max.iter) {
+            # Optimize log-likelihood for alpha, beta with fixed Delta.
+            opt <- optim(c(alpha, beta), log.likelihood, gr = NULL,
+                X, Fy, matpow(Delta, -1), c(t, r), c(p, k))
+            # Store previous alpha, beta and Delta (for the break condition).
+            Delta.last <- Delta
+            B.last <- B
+            # Extract optimized alpha, beta.
+            alpha <- matrix(opt$par[1:(t * r)], t, r)
+            beta <- matrix(opt$par[(t * r + 1):length(opt$par)], p, k)
+            # Calc new Delta with likelihood optimized alpha, beta.
+            B <- kronecker(alpha, beta)
+            resid <- X - tcrossprod(Fy, B)
+            Delta <- crossprod(resid) / nrow(X)
+
+            # call logger before break condition check
+            if (is.function(logger)) {
+                # Transformed Residuals (using `matpow` as robust inversion algo,
+                # uses the Moore-Penrose pseudo inverse for singular `Delta`)
+                resid.trans <- resid %*% matpow(Delta, -1)
+                loss <- 0.5 * (nrow(X) * log(det(Delta)) + sum(resid.trans * resid))
+                logger(iter, loss, alpha, beta, Delta, NA)
+            }
+
+            # Check break condition 1.
+            if (norm(Delta - Delta.last, 'F') < eps1 * norm(Delta, 'F')) {
+                # Check break condition 2.
+                if (norm(B - B.last, 'F') < eps2 * norm(B, 'F')) {
+                    break
+                }
+            }
+        }
+    }
+
+    # calc final loss
+    resid.trans <- resid %*% matpow(Delta, -1)
+    loss <- 0.5 * (nrow(X) * log(det(Delta)) + sum(resid.trans * resid))
+
+    list(loss = loss, alpha = alpha, beta = beta, Delta = Delta)
+}
diff --git a/tensorPredictors/R/kpir_kron.R b/tensorPredictors/R/kpir_kron.R
new file mode 100644
index 0000000..bbdf5ff
--- /dev/null
+++ b/tensorPredictors/R/kpir_kron.R
@@ -0,0 +1,211 @@
+#' Gradient Descent Based Tensor Predictor method with Nesterov Accelerated
+#' Momentum and a Kronecker structure assumption for the residual covariance
+#' `Delta = Delta.1 %x% Delta.2` (simple plug-in version!)
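+#'
+#' Reading off the code below (not a formal statement), the fitted model
+#' appears to be, per observation,
+#' \deqn{X_i = \beta F_{y_i} \alpha^T + \epsilon_i, \qquad
+#'     cov(vec(\epsilon_i)) = \Delta_1 \otimes \Delta_2,}
+#' with \eqn{\alpha} of dimension \eqn{q\times r}, \eqn{\beta} of dimension
+#' \eqn{p\times k}, \eqn{\Delta_1} (\eqn{q\times q}) and \eqn{\Delta_2}
+#' (\eqn{p\times p}).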
+#'
+#' @export
+kpir.kron <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
+    max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
+    nesterov.scaling = function(a, t) { 0.5 * (1 + sqrt(1 + (2 * a)^2)) },
+    eps = .Machine$double.eps,
+    logger = NULL
+) {
+
+    # Check if X and Fy have the same number of observations
+    stopifnot(nrow(X) == NROW(Fy))
+    n <- nrow(X)    # Number of observations
+
+    # Get and check predictor dimensions (convert to 3-tensor if needed)
+    if (length(dim(X)) == 2L) {
+        stopifnot(!missing(shape))
+        stopifnot(ncol(X) == prod(shape[1:2]))
+        p <- as.integer(shape[1])   # Predictor "height"
+        q <- as.integer(shape[2])   # Predictor "width"
+    } else if (length(dim(X)) == 3L) {
+        p <- dim(X)[2]
+        q <- dim(X)[3]
+    } else {
+        stop("'X' must be a matrix or 3-tensor")
+    }
+
+    # Get and check response dimensions (and convert to 3-tensor if needed)
+    if (!is.array(Fy)) {
+        Fy <- as.array(Fy)
+    }
+    if (length(dim(Fy)) == 1L) {
+        k <- r <- 1L
+        dim(Fy) <- c(n, 1L, 1L)
+    } else if (length(dim(Fy)) == 2L) {
+        stopifnot(!missing(shape))
+        stopifnot(ncol(Fy) == prod(shape[3:4]))
+        k <- as.integer(shape[3])   # Response functional "height"
+        r <- as.integer(shape[4])   # Response functional "width"
+    } else if (length(dim(Fy)) == 3L) {
+        k <- dim(Fy)[2]
+        r <- dim(Fy)[3]
+    } else {
+        stop("'Fy' must be a vector, matrix or 3-tensor")
+    }
+
+
+    ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
+    # Vectorize
+    dim(Fy) <- c(n, k * r)
+    dim(X) <- c(n, p * q)
+    # Solve
+    cpFy <- crossprod(Fy)   # TODO: Check/Test and/or replace
+    if (n <= k * r || qr(cpFy)$rank < k * r) {
+        # In case of an under-determined system replace the inverse in the
+        # normal equation by the Moore-Penrose pseudo inverse
+        B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
+    } else {
+        # Compute the OLS estimate via the normal equation
+        B <- t(solve(cpFy, crossprod(Fy, X)))
+    }
+
+    # De-vectorize (from now on, tensor arithmetic)
+    dim(Fy) <- c(n, k, r)
+    dim(X) <- c(n, p, q)
+
+    # Decompose `B = alpha x beta` into `alpha` and `beta`
+    c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
+
+    # Compute residuals
+    resid <- X - (Fy %x_3% alpha0 %x_2% beta0)
+
+    # Covariance estimate
+    Delta.1 <- tcrossprod(mat(resid, 3))    # q x q
+    Delta.2 <- tcrossprod(mat(resid, 2))    # p x p
+    tr <- sum(diag(Delta.1))
+    Delta.1 <- Delta.1 / sqrt(n * tr)
+    Delta.2 <- Delta.2 / sqrt(n * tr)
+
+    # Transformed Residuals
+    resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)
+
+    # Evaluate negative log-likelihood
+    loss <- 0.5 * (n * (p * log(det(Delta.1)) + q * log(det(Delta.2))) +
+        sum(resid.trans * resid))
+
+    # Call history callback (logger) before the first iterate
+    if (is.function(logger)) {
+        logger(0L, loss, alpha0, beta0, Delta.1, Delta.2, NA)
+    }
+
+
+    ### Step 2: MLE with the LS solution as starting value
+    a0 <- 0
+    a1 <- 1
+    alpha1 <- alpha0
+    beta1 <- beta0
+
+    # main descent loop
+    no.nesterov <- TRUE
+    break.reason <- NA
+    for (iter in seq_len(max.iter)) {
+        if (no.nesterov) {
+            # without extrapolation as fallback
+            S.alpha <- alpha1
+            S.beta <- beta1
+        } else {
+            # extrapolation using previous direction
+            S.alpha <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0)
+            S.beta <- beta1 + ((a0 - 1) / a1) * ( beta1 - beta0)
+        }
+
+        # Extrapolated Residuals
+        resid <- X - (Fy %x_3% S.alpha %x_2% S.beta)
+
+        # Covariance Estimates
+        Delta.1 <- tcrossprod(mat(resid, 3))
+        Delta.2 <- tcrossprod(mat(resid, 2))
+        tr <- sum(diag(Delta.1))
+        Delta.1 <- Delta.1 / sqrt(n * tr)
+        Delta.2 <- Delta.2 / sqrt(n * tr)
+
+        # Transform Residuals
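+        # (equivalent to R_i <- solve(Delta.2) %*% R_i %*% solve(Delta.1) for
+        # each observation's p x q residual matrix R_i, using the symmetry of
+        # both Delta factors)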
+        resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)
+
+        # Calculate Gradients (at the extrapolated point S.alpha, S.beta)
+        grad.alpha <- tcrossprod(mat(resid.trans, 3), mat(Fy %x_2% S.beta, 3))
+        grad.beta  <- tcrossprod(mat(resid.trans, 2), mat(Fy %x_3% S.alpha, 2))
+
+        # Backtracking line search (Armijo type)
+        # The `inner.prod` is used in the Armijo break condition but does not
+        # depend on the step size.
+        inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
+
+        # backtracking loop
+        for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) {
+            # Update `alpha` and `beta` (note: add(+), the gradients already
+            # point into the negative slope direction of the loss because they
+            # are gradients of the log-likelihood [NOT the negative
+            # log-likelihood])
+            alpha.temp <- S.alpha + delta * grad.alpha
+            beta.temp <- S.beta + delta * grad.beta
+
+            # Update Residuals, Covariance and transformed Residuals
+            resid <- X - (Fy %x_3% alpha.temp %x_2% beta.temp)
+            Delta.1 <- tcrossprod(mat(resid, 3))
+            Delta.2 <- tcrossprod(mat(resid, 2))
+            tr <- sum(diag(Delta.1))
+            Delta.1 <- Delta.1 / sqrt(n * tr)
+            Delta.2 <- Delta.2 / sqrt(n * tr)
+            resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)
+
+            # Evaluate negative log-likelihood
+            loss.temp <- 0.5 * (n * (p * log(det(Delta.1)) + q * log(det(Delta.2))) +
+                sum(resid.trans * resid))
+
+            # Armijo line search break condition
+            if (loss.temp <= loss - 0.1 * delta * inner.prod) {
+                break
+            }
+        }
+
+        # Call logger (invoke history callback)
+        if (is.function(logger)) {
+            logger(iter, loss.temp, alpha.temp, beta.temp, Delta.1, Delta.2, delta)
+        }
+
+        # Ensure descent
+        if (loss.temp < loss) {
+            alpha0 <- alpha1
+            alpha1 <- alpha.temp
+            beta0 <- beta1
+            beta1 <- beta.temp
+
+            # check break conditions (in descent case)
+            if (mean(abs(alpha1)) + mean(abs(beta1)) < eps) {
+                break.reason <- "alpha, beta numerically zero"
+                break   # basically, estimates are zero -> stop
+            }
+            if (inner.prod < eps * (p * q + r * k)) {
+                break.reason <- "mean squared gradient is smaller than epsilon"
+                break   # mean squared gradient is smaller than epsilon -> stop
+            }
+            if (abs(loss.temp - loss) < eps) {
+                break.reason <- "decrease is too small (slow)"
+                break   # decrease is too small (slow) -> stop
+            }
+
+            loss <- loss.temp
+            no.nesterov <- FALSE    # always reset
+        } else if (!no.nesterov) {
+            no.nesterov <- TRUE     # retry without momentum
+            next
+        } else {
+            break.reason <- "failed even without momentum"
+            break   # failed even without momentum -> stop
+        }
+
+        # update momentum scaling
+        a0 <- a1
+        a1 <- nesterov.scaling(a1, iter)
+
+        # Set the next iteration's starting step.size to the line searched
+        # step size (while allowing it to increase)
+        step.size <- 1.618034 * delta
+
+    }
+
+    list(loss = loss, alpha = alpha1, beta = beta1,
+        Delta.1 = Delta.1, Delta.2 = Delta.2, break.reason = break.reason)
+}
diff --git a/tensorPredictors/R/kpir_momentum.R b/tensorPredictors/R/kpir_momentum.R
new file mode 100644
index 0000000..3ac5319
--- /dev/null
+++ b/tensorPredictors/R/kpir_momentum.R
@@ -0,0 +1,192 @@
+#' Gradient Descent Based Tensor Predictor method with Nesterov Accelerated
+#' Momentum
+#'
+#' @export
+kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
+    max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
+    nesterov.scaling = function(a, t) { 0.5 * (1 + sqrt(1 + (2 * a)^2)) },
+    eps = .Machine$double.eps,
+    logger = NULL
+) {
+
+    # Check if X and Fy have the same number of observations
+    stopifnot(nrow(X) == NROW(Fy))
+    n <- nrow(X)    # Number of observations
+
+    # Get and check predictor dimensions
+    if (length(dim(X)) == 2L) {
+        stopifnot(!missing(shape))
+        stopifnot(ncol(X) == prod(shape[1:2]))
+        p <- as.integer(shape[1])   # Predictor "height"
+        q <- as.integer(shape[2])   # Predictor "width"
+    } else if (length(dim(X)) == 3L) {
+        p <- dim(X)[2]
+        q <- dim(X)[3]
+        dim(X) <- c(n, p * q)
+    } else {
+        stop("'X' must be a matrix or 3-tensor")
+    }
+
+    # Get and check response dimensions
+    if (!is.array(Fy)) {
+        Fy <- as.array(Fy)
+    }
+    if (length(dim(Fy)) == 1L) {
+        k <- r <- 1L
+        dim(Fy) <- c(n, 1L)
+    } else if (length(dim(Fy)) == 2L) {
+        stopifnot(!missing(shape))
+        stopifnot(ncol(Fy) == prod(shape[3:4]))
+        k <- as.integer(shape[3])   # Response functional "height"
+        r <- as.integer(shape[4])   # Response functional "width"
+    } else if (length(dim(Fy)) == 3L) {
+        k <- dim(Fy)[2]
+        r <- dim(Fy)[3]
+        dim(Fy) <- c(n, k * r)
+    } else {
+        stop("'Fy' must be a vector, matrix or 3-tensor")
+    }
+
+
+    ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
+    cpFy <- crossprod(Fy)   # TODO: Check/Test and/or replace
+    if (n <= k * r || qr(cpFy)$rank < k * r) {
+        # In case of under-determined system replace the inverse in the normal
+        # equation by the Moore-Penrose Pseudo Inverse
+        B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
+    } else {
+        # Compute OLS estimate by the Normal Equation
+        B <- t(solve(cpFy, crossprod(Fy, X)))
+    }
+
+    # Decompose `B = alpha x beta` into `alpha` and `beta`
+    c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
+
+    # Compute residuals
+    resid <- X - tcrossprod(Fy, kronecker(alpha0, beta0))
+
+    # Covariance estimate
+    Delta <- crossprod(resid) / n
+
+    # Transformed Residuals (using `matpow` as robust inversion algo,
+    # uses Moore-Penrose Pseudo Inverse in case of singular `Delta`)
+    resid.trans <- resid %*% matpow(Delta, -1)
+
+    # Evaluate negative log-likelihood
+    loss <- 0.5 * (n * log(det(Delta)) + sum(resid.trans * resid))
+
+    # Call history callback (logger) before the first iterate
+    if (is.function(logger)) {
+        logger(0L, loss, alpha0, beta0, Delta, NA)
+    }
+
+
+    ### Step 2: MLE with LS solution as starting value
+    a0 <- 0
+    a1 <- 1
+    alpha1 <- alpha0
+    beta1 <- beta0
+
+    # main descent loop
+    no.nesterov <- TRUE
+    for (iter in seq_len(max.iter)) {
+        if (no.nesterov) {
+            # without extrapolation as fallback
+            S.alpha <- alpha1
+            S.beta <- beta1
+        } else {
+            # extrapolation using previous direction
+            S.alpha <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0)
+            S.beta <- beta1 + ((a0 - 1) / a1) * ( beta1 - beta0)
+        }
+
+        # Extrapolated Residuals, Covariance and transformed Residuals
+        resid <- X - tcrossprod(Fy, kronecker(S.alpha, S.beta))
+        Delta <- crossprod(resid) / n
+        resid.trans <- resid %*% matpow(Delta, -1)
+
+        # Sum over kronecker prod by observation (Face-Splitting Product)
+        KR <- colSums(rowKronecker(Fy, resid.trans))
+        dim(KR) <- c(p, q, k, r)
+
+        # (Nesterov) `alpha` Gradient
+        R.alpha <- aperm(KR, c(2, 4, 1, 3))
+        dim(R.alpha) <- c(q * r, p * k)
+        grad.alpha <- c(R.alpha %*% c(S.beta))
+
+        # (Nesterov) `beta` Gradient
+        R.beta <- aperm(KR, c(1, 3, 2, 4))
+        dim(R.beta) <- c(p * k, q * r)
+        grad.beta <- c(R.beta %*% c(S.alpha))
+
+        # Backtracking line search (Armijo type)
+        # The `inner.prod` is used in the Armijo break condition but does not
+        # depend on the step size.
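+        # (Armijo condition checked below:
+        #  loss.temp <= loss - 0.1 * delta * inner.prod, where inner.prod is
+        #  the squared norm of the full gradient, i.e. <grad, grad>)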
+        inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
+
+        # backtracking loop
+        for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) {
+            # Update `alpha` and `beta` (note: add(+), the gradients already
+            # point into the negative slope direction of the loss because they
+            # are gradients of the log-likelihood [NOT the negative
+            # log-likelihood])
+            alpha.temp <- S.alpha + delta * grad.alpha
+            beta.temp <- S.beta + delta * grad.beta
+
+            # Update Residuals, Covariance and transformed Residuals
+            resid <- X - tcrossprod(Fy, kronecker(alpha.temp, beta.temp))
+            Delta <- crossprod(resid) / n
+            resid.trans <- resid %*% matpow(Delta, -1)
+
+            # Evaluate negative log-likelihood
+            loss.temp <- 0.5 * (n * log(det(Delta)) + sum(resid.trans * resid))
+
+            # Armijo line search break condition
+            if (loss.temp <= loss - 0.1 * delta * inner.prod) {
+                break
+            }
+        }
+
+        # Call logger (invoke history callback)
+        if (is.function(logger)) {
+            logger(iter, loss.temp, alpha.temp, beta.temp, Delta, delta)
+        }
+
+        # Ensure descent
+        if (loss.temp < loss) {
+            alpha0 <- alpha1
+            alpha1 <- alpha.temp
+            beta0 <- beta1
+            beta1 <- beta.temp
+
+            # check break conditions (in descent case)
+            if (mean(abs(alpha1)) + mean(abs(beta1)) < eps) {
+                break   # basically, estimates are zero -> stop
+            }
+            if (inner.prod < eps * (p * q + r * k)) {
+                break   # mean squared gradient is smaller than epsilon -> stop
+            }
+            if (abs(loss.temp - loss) < eps) {
+                break   # decrease is too small (slow) -> stop
+            }
+
+            loss <- loss.temp
+            no.nesterov <- FALSE    # always reset
+        } else if (!no.nesterov) {
+            no.nesterov <- TRUE     # retry without momentum
+            next
+        } else {
+            break   # failed even without momentum -> stop
+        }
+
+        # update momentum scaling
+        a0 <- a1
+        a1 <- nesterov.scaling(a1, iter)
+
+        # Set the next iteration's starting step.size to the line searched
+        # step size (while allowing it to increase)
+        step.size <- 1.618034 * delta
+
+    }
+
+    list(loss = loss, alpha = alpha1, beta = beta1, Delta = Delta)
+}
diff --git a/tensorPredictors/R/kpir_new.R b/tensorPredictors/R/kpir_new.R
new file mode 100644
index 0000000..e1bdfff
--- /dev/null
+++ b/tensorPredictors/R/kpir_new.R
@@ -0,0 +1,157 @@
+#' Gradient Descent Based Tensor Predictor method
+#'
+#' @export
+kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
+    max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
+    eps = .Machine$double.eps,
+    logger = NULL
+) {
+
+    # Check if X and Fy have the same number of observations
+    stopifnot(nrow(X) == NROW(Fy))
+    n <- nrow(X)    # Number of observations
+
+    # Get and check predictor dimensions
+    if (length(dim(X)) == 2L) {
+        stopifnot(!missing(shape))
+        stopifnot(ncol(X) == prod(shape[1:2]))
+        p <- as.integer(shape[1])   # Predictor "height"
+        q <- as.integer(shape[2])   # Predictor "width"
+    } else if (length(dim(X)) == 3L) {
+        p <- dim(X)[2]
+        q <- dim(X)[3]
+        dim(X) <- c(n, p * q)
+    } else {
+        stop("'X' must be a matrix or 3-tensor")
+    }
+
+    # Get and check response dimensions
+    if (!is.array(Fy)) {
+        Fy <- as.array(Fy)
+    }
+    if (length(dim(Fy)) == 1L) {
+        k <- r <- 1L
+        dim(Fy) <- c(n, 1L)
+    } else if (length(dim(Fy)) == 2L) {
+        stopifnot(!missing(shape))
+        stopifnot(ncol(Fy) == prod(shape[3:4]))
+        k <- as.integer(shape[3])   # Response functional "height"
+        r <- as.integer(shape[4])   # Response functional "width"
+    } else if (length(dim(Fy)) == 3L) {
+        k <- dim(Fy)[2]
+        r <- dim(Fy)[3]
+        dim(Fy) <- c(n, k * r)
+    } else {
+        stop("'Fy' must be a vector, matrix or 3-tensor")
+    }
+
+    ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
+    cpFy <- crossprod(Fy)   # TODO: Check/Test and/or replace
+    if (n <= k * r || qr(cpFy)$rank < k * r) {
+        # In case of under-determined system replace the inverse in the normal
+        # equation by the Moore-Penrose Pseudo Inverse
+        B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
+    } else {
+        # Compute OLS estimate by the Normal Equation
+        B <- t(solve(cpFy, crossprod(Fy, X)))
+    }
+
+    # Decompose `B = alpha x beta` into `alpha` and `beta`
+    c(alpha, beta) %<-% approx.kronecker(B, c(q, r), c(p, k))
+
+    # Compute residuals
+    resid <- X - tcrossprod(Fy, kronecker(alpha, beta))
+
+    # Covariance estimate
+    Delta <- crossprod(resid) / n
+
+    # Transformed Residuals (using `matpow` as robust inversion algo,
+    # uses Moore-Penrose Pseudo Inverse in case of singular `Delta`)
+    resid.trans <- resid %*% matpow(Delta, -1)
+
+    # Evaluate negative log-likelihood
+    loss <- 0.5 * (n * log(det(Delta)) + sum(resid.trans * resid))
+
+    # Call history callback (logger) before the first iterate
+    if (is.function(logger)) {
+        logger(0L, loss, alpha, beta, Delta, NA)
+    }
+
+
+    ### Step 2: MLE with LS solution as starting value
+    for (iter in seq_len(max.iter)) {
+        # Sum over kronecker prod by observation (Face-Splitting Product)
+        KR <- colSums(rowKronecker(Fy, resid.trans))
+        dim(KR) <- c(p, q, k, r)
+
+        # `alpha` Gradient
+        R.Alpha <- aperm(KR, c(2, 4, 1, 3))
+        dim(R.Alpha) <- c(q * r, p * k)
+        grad.alpha <- c(R.Alpha %*% c(beta))
+
+        # `beta` Gradient
+        R.Beta <- aperm(KR, c(1, 3, 2, 4))
+        dim(R.Beta) <- c(p * k, q * r)
+        grad.beta <- c(R.Beta %*% c(alpha))
+
+        # Line Search (Armijo type)
+        # The `inner.prod` is used in the Armijo break condition but does not
+        # depend on the step size.
+        inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
+
+        # Line Search loop
+        for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) {
+            # Update `alpha` and `beta` (note: add(+), the gradients already
+            # point into the negative slope direction of the loss because they
+            # are gradients of the log-likelihood [NOT the negative
+            # log-likelihood])
+            alpha.temp <- alpha + delta * grad.alpha
+            beta.temp <- beta + delta * grad.beta
+
+            # Update Residuals, Covariance and transformed Residuals
+            resid <- X - tcrossprod(Fy, kronecker(alpha.temp, beta.temp))
+            Delta <- crossprod(resid) / n
+            resid.trans <- resid %*% matpow(Delta, -1)
+
+            # Evaluate negative log-likelihood
+            loss.temp <- 0.5 * (n * log(det(Delta)) + sum(resid.trans * resid))
+
+            # Armijo line search break condition
+            if (loss.temp <= loss - 0.1 * delta * inner.prod) {
+                break
+            }
+        }
+
+        # Call logger (invoke history callback)
+        if (is.function(logger)) {
+            logger(iter, loss.temp, alpha.temp, beta.temp, Delta, delta)
+        }
+
+        # Ensure descent
+        if (loss.temp < loss) {
+            alpha <- alpha.temp
+            beta <- beta.temp
+
+            # check break conditions (in descent case)
+            if (mean(abs(alpha)) + mean(abs(beta)) < eps) {
+                break   # basically, estimates are zero -> stop
+            }
+            if (inner.prod < eps * (p * q + r * k)) {
+                break   # mean squared gradient is smaller than epsilon -> stop
+            }
+            if (abs(loss.temp - loss) < eps) {
+                break   # decrease is too small (slow) -> stop
+            }
+
+            loss <- loss.temp
+        } else {
+            break
+        }
+
+        # Set the next iteration's starting step.size to the line searched
+        # step size (while allowing it to increase)
+        step.size <- 1.618034 * delta
+
+    }
+
+    list(loss = loss, alpha = alpha, beta = beta, Delta = Delta)
+}
diff --git a/tensorPredictors/R/matricize.R b/tensorPredictors/R/matricize.R
new file mode 100644
index 0000000..cb91e4d
--- /dev/null
+++ b/tensorPredictors/R/matricize.R
@@ -0,0 +1,23 @@
+#' Matricization
+#'
+#' @param T multi-dimensional array of order at least \code{mode}
+#' @param mode dimension along which to matricize
+#'
+#' @returns matrix of dimensions \code{dim(T)[mode]} by \code{prod(dim(T)[-mode])}
+#'
+#' @export
+mat <- function(T, mode) {
+    mode <- as.integer(mode)
+
+    dims <- dim(T)
+    if (length(dims) < mode) {
+        stop("'mode' must be a positive integer not exceeding the tensor order")
+    }
+
+    if (mode > 1L) {
+        T <- aperm(T, c(mode, seq_along(dims)[-mode]))
+    }
+    dim(T) <- c(dims[mode], prod(dims[-mode]))
+
+    T
+}
diff --git a/tensorPredictors/R/matrixImage.R b/tensorPredictors/R/matrixImage.R
index ed257a2..6c6a8eb 100644
--- a/tensorPredictors/R/matrixImage.R
+++ b/tensorPredictors/R/matrixImage.R
@@ -15,7 +15,8 @@
 #'
 #' @export
 matrixImage <- function(A, add.values = FALSE,
-    main = NULL, sub = NULL, interpolate = FALSE, ...
+    main = NULL, sub = NULL, interpolate = FALSE, ...,
+    digits = getOption("digits")
 ) {
     # plot raster image
     plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "red",
@@ -38,8 +39,11 @@ matrixImage <- function(A, add.values = FALSE,
 
     # Writes matrix values (in colored element grids)
     if (any(add.values)) {
-        if (length(add.values) > 1) { A[!add.values] <- NA }
-        text(rep(x - 0.5, nrow(A)), rep(rev(y - 0.5), each = ncol(A)), A,
+        if (length(add.values) > 1) {
+            A[!add.values] <- NA
+            A[add.values] <- format(A[add.values], digits = digits)
+        }
+        text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A,
             adj = 0.5, col = as.integer(S > 0.65))
     }
 }
diff --git a/tensorPredictors/R/multi_assign.R b/tensorPredictors/R/multi_assign.R
index 9909688..873d04f 100644
--- a/tensorPredictors/R/multi_assign.R
+++ b/tensorPredictors/R/multi_assign.R
@@ -1,4 +1,4 @@
-#' Multi-Value assigne operator.
+#' Multi-Value assign operator.
 #'
 #' @param lhs vector of variables (or variable names) to assign values.
 #' @param rhs object that can be coersed to list of the same length as
diff --git a/tensorPredictors/R/tensor_times_matrix.R b/tensorPredictors/R/tensor_times_matrix.R
new file mode 100644
index 0000000..6b7af26
--- /dev/null
+++ b/tensorPredictors/R/tensor_times_matrix.R
@@ -0,0 +1,72 @@
+#' Tensor Times Matrix (n-mode tensor matrix product)
+#'
+#' @param T array of order at least \code{mode}
+#' @param M matrix, the right hand side of the mode product such that
+#'  \code{ncol(M)} equals \code{dim(T)[mode]}
+#' @param mode the mode of the product in the range \code{1:length(dim(T))}
+#'
+#' @returns multi-dimensional array of the same order as \code{T} with the
+#'  \code{mode} dimension equal to \code{nrow(M)}
+#'
+#' @export
+ttm <- function(T, M, mode = length(dim(T))) {
+    mode <- as.integer(mode)
+    dims <- dim(T)
+
+    if (length(dims) < mode) {
+        stop(sprintf("Mode (%d) must not exceed the tensor order (%d)",
+            mode, length(dims)))
+    }
+    if (dims[mode] != ncol(M)) {
+        stop(sprintf("Dimension mismatch: mode %d has dim %d but ncol(M) is %d.",
+            mode, dims[mode], ncol(M)))
+    }
+
+    # Special case: `mode` equals the tensor order. Then a more efficient
+    # alternative is Z M' where Z is only the reshaped (but NOT permuted)
+    # tensor, analogous to the mode-1 case below where no permutation is
+    # required either.
+    if (mode == length(dims)) {
+        # Convert tensor to matrix (similar to matricization)
+        dim(T) <- c(prod(dims[-mode]), dims[mode])
+
+        # Equivalent matrix product
+        C <- tcrossprod(T, M)
+
+        # Shape back to tensor
+        dim(C) <- c(dims[-mode], nrow(M))
+
+        C
+    } else {
+        # Matricize tensor T
+        if (mode != 1L) {
+            perm <- c(mode, seq_along(dims)[-mode])
+            T <- aperm(T, perm)
+        }
+        dim(T) <- c(dims[mode], prod(dims[-mode]))
+
+        # Perform the equivalent matrix multiplication
+        C <- M %*% T
+
+        # Reshape and rearrange the matricized result back to a tensor
+        dim(C) <- c(nrow(M), dims[-mode])
+        if (mode == 1L) {
+            C
+        } else {
+            aperm(C, order(perm))
+        }
+    }
+
+}
+
+#' @rdname ttm
+#' @export
+`%x_1%` <- function(T, M) ttm(T, M, 1L)
+#' @rdname ttm
+#' @export
+`%x_2%` <- function(T, M) ttm(T, M, 2L)
+#' @rdname ttm
+#' @export
+`%x_3%` <- function(T, M) ttm(T, M, 3L)
+#' @rdname ttm
+#' @export
+`%x_4%` <- function(T, M) ttm(T, M, 4L)
diff --git a/tensorPredictors/R/var_kronecker.R b/tensorPredictors/R/var_kronecker.R
new file mode 100644
index 0000000..1ca2564
--- /dev/null
+++ b/tensorPredictors/R/var_kronecker.R
@@ -0,0 +1,78 @@
+#' Kronecker decomposed Variance Matrix estimation.
+#'
+#' @description Computes the Kronecker decomposition factors of the variance
+#' matrix
+#' \deqn{Var(X) = tr^{-1} (L\otimes R), \qquad tr = tr(L) = tr(R).}
+#'
+#' @param X numeric matrix or 3d array.
+#' @param shape in case of \code{X} being a matrix, this specifies the
+#'  predictor shape, otherwise ignored.
+#' @param center boolean specifying if \code{X} is centered before computing
+#'  the left/right second moments. Set to \code{FALSE} for already centered
+#'  data.
+#'
+#' @returns List containing
+#' \describe{
+#'     \item{lhs}{Left Hand Side \eqn{L} of the Kronecker decomposed variance matrix}
+#'     \item{rhs}{Right Hand Side \eqn{R} of the Kronecker decomposed variance matrix}
+#'     \item{trace}{Scaling factor \eqn{tr = tr(L) = tr(R)} for the variance matrix}
+#' }
+#'
+#' @examples
+#' n <- 503L   # nr. of observations
+#' p <- 32L    # first predictor dimension
+#' q <- 27L    # second predictor dimension
+#' lhs <- 0.5^abs(outer(seq_len(q), seq_len(q), `-`))  # Left Var components
+#' rhs <- 0.5^abs(outer(seq_len(p), seq_len(p), `-`))  # Right Var components
+#' # rmvnorm() is provided by the 'mvtnorm' package
+#' X <- mvtnorm::rmvnorm(n, sigma = kronecker(lhs, rhs))  # Multivariate normal data
+#'
+#' # Estimate Kronecker decomposed variance matrix
+#' dim(X)      # c(n, p * q)
+#' fit <- var.kronecker(X, shape = c(p, q))
+#'
+#' # equivalent
+#' dim(X) <- c(n, p, q)
+#' fit <- var.kronecker(X)
+#'
+#' # Compute the complete estimated variance matrix
+#' varX.hat <- fit$trace^-1 * kronecker(fit$lhs, fit$rhs)
+#'
+#' # or its inverse
+#' varXinv.hat <- fit$trace * kronecker(solve(fit$lhs), solve(fit$rhs))
+#'
+var.kronecker <- function(X, shape = dim(X)[-1], center = TRUE) {
+    # Get and check predictor dimensions
+    n <- nrow(X)
+    if (length(dim(X)) == 2L) {
+        stopifnot(ncol(X) == prod(shape[1:2]))
+        p <- as.integer(shape[1])   # Predictor "height"
+        q <- as.integer(shape[2])   # Predictor "width"
+        dim(X) <- c(n, p, q)
+    } else if (length(dim(X)) == 3L) {
+        p <- dim(X)[2]
+        q <- dim(X)[3]
+    } else {
+        stop("'X' must be a matrix or 3-tensor")
+    }
+
+    if (isTRUE(center)) {
+        # Center X; X[, i, j] <- X[, i, j] - mean(X[, i, j])
+        # (reshape to a matrix first, `scale` expects one)
+        dim(X) <- c(n, p * q)
+        X <- scale(X, scale = FALSE)
+        dim(X) <- c(n, p, q)
+    }
+
+    # Calc left/right side of the Kronecker structured covariance
+    # var(X) = var.lhs %x% var.rhs
+    var.lhs <- .rowMeans(apply(X, 1, crossprod), q * q, n)
+    dim(var.lhs) <- c(q, q)
+    var.rhs <- .rowMeans(apply(X, 1, tcrossprod), p * p, n)
+    dim(var.rhs) <- c(p, p)
+
+    # Estimate the scaling factor tr = tr(var.lhs) = tr(var.rhs)
+    trace <- sum(X^2) / n
+
+    list(lhs = var.lhs, rhs = var.rhs, trace = trace)
+}
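Usage sketch (not part of the diff): a quick, hypothetical check of the newly
exported helpers `ttm` / `%x_2%`, `mat` and `rowKronecker`; the dimensions and
data below are made up for illustration and assume the package from this
branch is installed.

    library(tensorPredictors)

    n <- 100L; p <- 5L; q <- 7L
    X <- array(rnorm(n * p * q), dim = c(n, p, q))
    M <- matrix(rnorm(3 * p), 3L, p)

    # mode-2 product contracts the p-dimension of X with the columns of M
    Y <- X %x_2% M                          # same as ttm(X, M, 2L)
    stopifnot(identical(dim(Y), c(n, 3L, q)))

    # mode-2 matricization: the p x (n * q) unfolding
    stopifnot(identical(dim(mat(X, 2L)), c(p, n * q)))

    # row-wise Kronecker (face-splitting): row i is kronecker(A[i, ], B[i, ])
    A <- matrix(rnorm(n * 2L), n, 2L)
    B <- matrix(rnorm(n * 3L), n, 3L)
    C <- rowKronecker(A, B)
    stopifnot(all.equal(c(C[1, ]), c(kronecker(A[1, ], B[1, ]))))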