diff --git a/LaTeX/main.tex b/LaTeX/main.tex
index fcb5d8f..705b433 100644
--- a/LaTeX/main.tex
+++ b/LaTeX/main.tex
@@ -306,10 +306,11 @@ because with $\|\mat{R}_i\|_F^2 = \tr \mat{R}_i\t{\mat{R}_i} = \tr \t{\mat{R}_i}
 \todo{ prove they are consistent, especially $\widetilde{\mat\Delta} = \tilde{s}^{-1}(\widetilde{\mat\Delta}_1\otimes\widetilde{\mat\Delta}_2)$!}
 The hoped-for benefit is that these covariance estimates are available in closed form, which means there is no need for an additional iterative estimation step.
 Before we start with the derivation of the gradients, we define the following two quantities
-\begin{displaymath}
-    \mat{S}_1 = \frac{1}{n}\sum_{i = 1}^n \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\quad{\color{gray}(q\times q)}, \qquad
-    \mat{S}_2 = \frac{1}{n}\sum_{i = 1}^n \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\quad{\color{gray}(p\times p)}.
-\end{displaymath}
+\begin{align*}
+    \mat{S}_1 = \frac{1}{n}\sum_{i = 1}^n \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i = \frac{1}{n}\ten{R}_{(3)}\t{(\ten{R}\ttm[2]\widetilde{\mat{\Delta}}_2^{-1})_{(3)}}\quad{\color{gray}(q\times q)}, \\
+    \mat{S}_2 = \frac{1}{n}\sum_{i = 1}^n \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} = \frac{1}{n}\ten{R}_{(2)}\t{(\ten{R}\ttm[3]\widetilde{\mat{\Delta}}_1^{-1})_{(2)}}\quad{\color{gray}(p\times p)}.
+\end{align*}
+\todo{Check tensor form! (see the numerical check in simulations/kpir_sim.R)}
 Now, the matrix normal with the covariance matrix of the vectorized quantities of the form $\mat{\Delta} = s^{-1}(\mat{\Delta}_1\otimes\mat{\Delta}_2)$ has the form
 \begin{align*}
diff --git a/simulations/kpir_sim.R b/simulations/kpir_sim.R
new file mode 100644
index 0000000..a05cb7e
--- /dev/null
+++ b/simulations/kpir_sim.R
@@ -0,0 +1,985 @@
+library(tensorPredictors)
+library(dplyr)
+library(ggplot2)
+
+### Exec all methods for a given data set and collect logs ###
+sim <- function(X, Fy, shape, alpha.true, beta.true, max.iter = 500L) {
+
+    # Logger creator: `substitute` binds the method-specific history
+    # data.frame (e.g. `hist.base`) to `hist`, which the callback then
+    # updates in place via `<<-`
+    logger <- function(name) {
+        eval(substitute(function(iter, loss, alpha, beta, ...) {
+            hist[iter + 1L, ] <<- c(
+                iter = iter,
+                loss = loss,
+                dist = (dist <- dist.subspace(c(kronecker(alpha.true, beta.true)),
+                                              c(kronecker(alpha, beta)))),
+                dist.alpha = (dist.alpha <- dist.subspace(c(alpha.true), c(alpha))),
+                dist.beta  = (dist.beta  <- dist.subspace(c( beta.true), c(beta ))),
+                norm.alpha = norm(alpha, "F"),
+                norm.beta  = norm(beta, "F")
+            )
+
+            cat(sprintf(
+                "%3d | l = %-12.4f - dist = %-.4e - alpha(%d, %d) = %-.4e - beta(%d, %d) = %-.4e\n",
+                iter, loss,
+                dist,
+                nrow(alpha), ncol(alpha), dist.alpha,
+                nrow(beta), ncol(beta), dist.beta
+            ))
+        }, list(hist = as.symbol(paste0("hist.", name)))))
+    }
+
+    # Initialize logger history targets
+    hist.base <- hist.new <- hist.momentum <- # hist.kron <-
+        data.frame(iter = seq(0L, max.iter),
+            loss = NA, dist = NA, dist.alpha = NA, dist.beta = NA,
+            norm.alpha = NA, norm.beta = NA
+        )
+    hist.kron <- NULL # TODO: fit kron version
+
+    # Base (old)
+    kpir.base(X, Fy, shape, max.iter = max.iter, logger = logger("base"))
+
+    # New (simple Gradient Descent)
+    kpir.new(X, Fy, shape, max.iter = max.iter, logger = logger("new"))
+
+    # Gradient Descent with Nesterov Momentum
+    kpir.momentum(X, Fy, shape, max.iter = max.iter, logger = logger("momentum"))
+
+    # # Residual Covariance Kronecker product assumption version
+    # kpir.kron(X, Fy, shape, max.iter = max.iter, logger = logger("kron"))
+
+    # Add method tags
+    hist.base$type     <- factor("base")
+    hist.new$type      <- factor("new")
+    hist.momentum$type <- factor("momentum")
+    # hist.kron$type     <- factor("kron")
+
+    # Combine results and return
+    rbind(hist.base, hist.new, hist.momentum, hist.kron)
+}
+
+## Generate some test data / DEBUG
+n <- 200 # Sample Size
+p <- sample(1:15, 1) # 11
+q <- sample(1:15, 1) # 3
+k <- sample(1:15, 1) # 7
+r <- sample(1:15, 1) # 5
+print(c(n, p, q, k, r))
+
+hist <- NULL
+for (rep in 1:20) {
+    alpha.true <- alpha <- matrix(rnorm(q * r), q, r)
+    beta.true  <- beta  <- matrix(rnorm(p * k), p, k)
+    y <- rnorm(n)
+    Fy <- do.call(cbind, Map(function(slope, offset) {
+        sin(slope * y + offset)
+    },
+        head(rep(seq(1, ceiling(0.5 * k * r)), each = 2), k * r),
+        head(rep(c(0, pi / 2), ceiling(0.5 * k * r)), k * r)
+    ))
+    Delta <- 0.5^abs(outer(seq_len(p * q), seq_len(p * q), `-`))
+    X <- tcrossprod(Fy, kronecker(alpha, beta)) + CVarE:::rmvnorm(n, sigma = Delta)
+
+    hist.sim <- sim(X, Fy, shape = c(p, q, k, r), alpha.true, beta.true)
+    hist.sim$repetition <- rep
+
+    hist <- rbind(hist, hist.sim)
+}
+
+
+saveRDS(hist, file = "AR.rds")
+hist$repetition <- factor(hist$repetition)
+
of iterations", + y = expression(l(hat(alpha), hat(beta))), + color = "method" + ) + + theme(legend.position = "bottom") + +dev.print(png, file = "sim01_loss.png", width = 768, height = 768, res = 125) + + +ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) + + geom_line(aes(y = dist)) + + geom_point(data = with(sub <- subset(hist, !is.na(dist)), + aggregate(sub, list(type, repetition), tail, 1) + ), aes(y = dist)) + + labs( + title = bquote(paste("Distance of estimate ", hat(B), " to true ", B == alpha %*% beta)), + subtitle = bquote(paste(Delta[i][j] == 0.25, " * ", 0.5^abs(i - j), ", ", + "20 repetitions, ", n == .(n), ", ", + p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))), + x = "nr. of iterations", + y = expression(abs(B * B^T - hat(B) * hat(B)^T)), + color = "method" + ) + + theme(legend.position = "bottom") + +dev.print(png, file = "sim01_dist.png", width = 768, height = 768, res = 125) + + +ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) + + geom_line(aes(y = dist.alpha)) + + geom_point(data = with(sub <- subset(hist, !is.na(dist.alpha)), + aggregate(sub, list(type, repetition), tail, 1) + ), aes(y = dist.alpha)) + + labs( + title = bquote(paste("Distance of estimate ", hat(alpha), " to true ", alpha)), + subtitle = bquote(paste(Delta[i][j] == 0.25, " * ", 0.5^abs(i - j), ", ", + "20 repetitions, ", n == .(n), ", ", + p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))), + x = "nr. of iterations", + y = expression(abs(alpha * alpha^T - hat(alpha) * hat(alpha)^T)), + color = "method" + ) + + theme(legend.position = "bottom") + +dev.print(png, file = "sim01_dist_alpha.png", width = 768, height = 768, res = 125) + + +ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) + + geom_line(aes(y = dist.beta)) + + geom_point(data = with(sub <- subset(hist, !is.na(dist.beta)), + aggregate(sub, list(type, repetition), tail, 1) + ), aes(y = dist.beta)) + + labs( + title = bquote(paste("Distance of estimate ", hat(beta), " to true ", beta)), + subtitle = bquote(paste(Delta[i][j] == 0.25, " * ", 0.5^abs(i - j), ", ", + "20 repetitions, ", n == .(n), ", ", + p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))), + x = "nr. of iterations", + y = expression(abs(beta * beta^T - hat(beta) * hat(beta)^T)), + color = "method" + ) + + theme(legend.position = "bottom") + +dev.print(png, file = "sim01_dist_beta.png", width = 768, height = 768, res = 125) + + +ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) + + geom_line(aes(y = norm.alpha)) + + geom_point(data = with(sub <- subset(hist, !is.na(norm.alpha)), + aggregate(sub, list(type, repetition), tail, 1) + ), aes(y = norm.alpha)) + + labs( + title = expression(paste("Norm of ", hat(alpha))), + subtitle = bquote(paste(Delta[i][j] == 0.25, " * ", 0.5^abs(i - j), ", ", + "20 repetitions, ", n == .(n), ", ", + p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))), + x = "nr. 
of iterations", + y = expression(abs(hat(alpha))[F]), + color = "method" + ) + + theme(legend.position = "bottom") + +dev.print(png, file = "sim01_norm_alpha.png", width = 768, height = 768, res = 125) + +ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) + + geom_line(aes(y = norm.beta)) + + geom_point(data = with(sub <- subset(hist, !is.na(norm.beta)), + aggregate(sub, list(type, repetition), tail, 1) + ), aes(y = norm.beta)) + + labs( + title = expression(paste("Norm of ", hat(beta))), + subtitle = bquote(paste(Delta[i][j] == 0.25, " * ", 0.5^abs(i - j), ", ", + "20 repetitions, ", n == .(n), ", ", + p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))), + x = "nr. of iterations", + y = expression(abs(hat(beta))[F]), + color = "method" + ) + + theme(legend.position = "bottom") + +dev.print(png, file = "sim01_norm_beta.png", width = 768, height = 768, res = 125) + + + +# local({ +# par(mfrow = c(2, 3), oma = c(2, 1, 1, 1), mar = c(3.1, 2.1, 2.1, 1.1), lwd = 2) +# plot(c(1, max.iter), range(c(hist.base$loss, hist.new$loss, hist.momentum$loss, hist.kron$loss), na.rm = TRUE), +# type = "n", log = "x", main = "loss") +# lines( hist.base$loss, col = 2) +# lines( hist.new$loss, col = 3) +# lines(hist.momentum$loss, col = 4) +# lines( hist.kron$loss, col = 5) +# yrange <- range(c(hist.base$step.size, hist.new$step.size, hist.momentum$step.size, hist.kron$step.size), +# na.rm = TRUE) +# plot(c(1, max.iter), yrange, +# type = "n", log = "x", main = "step.size") +# lines( hist.base$step.size, col = 2) +# lines( hist.new$step.size, col = 3) +# lines(hist.momentum$step.size, col = 4) +# lines( hist.kron$step.size, col = 5) +# # lines( hist.base$step.size, col = 4) # there is no step.size +# plot(0, 0, type = "l", bty = "n", xaxt = "n", yaxt = "n") +# legend("topleft", legend = c("Base", "GD", "GD + Momentum", "Kron + GD + Momentum"), col = 2:5, +# lwd = par("lwd"), xpd = TRUE, horiz = FALSE, cex = 1.2, bty = "n", +# x.intersp = 1, y.intersp = 1.5) +# # xpd = TRUE makes the legend plot to the figure +# plot(c(1, max.iter), range(c(hist.base$dist, hist.new$dist, hist.momentum$dist, hist.kron$dist), na.rm = TRUE), +# type = "n", log = "x", main = "dist") +# lines( hist.base$dist, col = 2) +# lines( hist.new$dist, col = 3) +# lines(hist.momentum$dist, col = 4) +# lines( hist.kron$dist, col = 5) +# plot(c(1, max.iter), range(c(hist.base$dist.alpha, hist.new$dist.alpha, hist.momentum$dist.alpha, hist.kron$dist.alpha), na.rm = TRUE), +# type = "n", log = "x", main = "dist.alpha") +# lines( hist.base$dist.alpha, col = 2) +# lines( hist.new$dist.alpha, col = 3) +# lines(hist.momentum$dist.alpha, col = 4) +# lines( hist.kron$dist.alpha, col = 5) +# plot(c(1, max.iter), range(c(hist.base$dist.beta, hist.new$dist.beta, hist.momentum$dist.beta, hist.kron$dist.beta), na.rm = TRUE), +# type = "n", log = "x", main = "dist.beta") +# lines( hist.base$dist.beta, col = 2) +# lines( hist.new$dist.beta, col = 3) +# lines(hist.momentum$dist.beta, col = 4) +# lines( hist.kron$dist.beta, col = 5) +# # par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) +# # plot(0, 0, type = 'l', bty = 'n', xaxt = 'n', yaxt = 'n') +# # legend('bottom', legend = c('GD', 'GD + Nesterov Momentum', 'Alternating'), col = 2:4, +# # lwd = 5, xpd = TRUE, horiz = TRUE, cex = 1, seg.len = 1, bty = 'n') +# # # xpd = TRUE makes the legend plot to the figure +# }) +# dev.print(png, file = "loss.png", width = 768, height = 768, res = 125) + +# with(list(a1 = alpha, a2 = fit.base$alpha, a3 
= fit.new$alpha, a4 = fit.momentum$alpha, +# b1 = beta, b2 = fit.base$beta, b3 = fit.new$beta, b4 = fit.momentum$beta), { +# par(mfrow = c(2, 4)) +# a2 <- sign(sum(sign(a1 * a2))) * a2 +# a3 <- sign(sum(sign(a1 * a3))) * a3 +# a4 <- sign(sum(sign(a1 * a4))) * a4 +# b2 <- sign(sum(sign(b1 * b2))) * b2 +# b3 <- sign(sum(sign(b1 * b3))) * b3 +# b4 <- sign(sum(sign(b1 * b4))) * b4 + +# matrixImage(a1, main = expression(alpha)) +# matrixImage(a2, main = expression(paste(hat(alpha)["Base"]))) +# matrixImage(a3, main = expression(paste(hat(alpha)["GD"]))) +# matrixImage(a4, main = expression(paste(hat(alpha)["GD+Nest"]))) +# matrixImage(b1, main = expression(beta)) +# matrixImage(b2, main = expression(paste(hat(beta)["Base"]))) +# matrixImage(b3, main = expression(paste(hat(beta)["GD"]))) +# matrixImage(b4, main = expression(paste(hat(beta)["GD+Nest"]))) +# }) +# dev.print(png, file = "estimates.png", width = 768, height = 768, res = 125) + + +# with(list(d1 = Delta, d2 = fit.base$Delta, d3 = fit.new$Delta, d4 = fit.momentum$Delta), { +# par(mfrow = c(2, 2)) + +# matrixImage(d1, main = expression(Delta)) +# matrixImage(d2, main = expression(hat(Delta)["Base"])) +# matrixImage(d3, main = expression(hat(Delta)["GD"])) +# matrixImage(d4, main = expression(hat(Delta)["GD+Nest"])) +# }) +# dev.print(png, file = "Delta.png", width = 768, height = 768, res = 125) + +# with(list(a1 = alpha, a2 = fit.base$alpha, a3 = fit.new$alpha, a4 = fit.momentum$alpha, +# b1 = beta, b2 = fit.base$beta, b3 = fit.new$beta, b4 = fit.momentum$beta), { +# par(mfrow = c(2, 2)) + +# matrixImage(kronecker(a1, b1), main = expression(B)) +# matrixImage(kronecker(a2, b2), main = expression(hat(B)["Base"])) +# matrixImage(kronecker(a3, b3), main = expression(hat(B)["GD"])) +# matrixImage(kronecker(a4, b4), main = expression(hat(B)["GD+Nest"])) +# }) +# dev.print(png, file = "B.png", width = 768, height = 768, res = 125) + +# with(list(a1 = alpha, a2 = fit.base$alpha, a3 = fit.new$alpha, a4 = fit.momentum$alpha, +# b1 = beta, b2 = fit.base$beta, b3 = fit.new$beta, b4 = fit.momentum$beta), { +# par(mfrow = c(3, 1), lwd = 1) + +# d2 <- kronecker(a1, b1) - kronecker(a2, b2) +# d3 <- kronecker(a1, b1) - kronecker(a3, b3) +# d4 <- kronecker(a1, b1) - kronecker(a4, b4) +# xlim <- c(-1, 1) * max(abs(c(d2, d3, d4))) +# breaks <- seq(xlim[1], xlim[2], len = 41) + +# hist(d2, main = expression(paste(base, (B - hat(B))[i])), +# breaks = breaks, xlim = xlim, freq = FALSE) +# lines(density(d2), col = 2) +# abline(v = range(d2), lty = 2) +# hist(d3, main = expression(paste(GD, (B - hat(B))[i])), +# breaks = breaks, xlim = xlim, freq = FALSE) +# lines(density(d3), col = 3) +# abline(v = range(d3), lty = 2) +# hist(d4, main = expression(paste(GD + Nest, (B - hat(B))[i])), +# breaks = breaks, xlim = xlim, freq = FALSE) +# lines(density(d4), col = 4) +# abline(v = range(d4), lty = 2) +# }) +# dev.print(png, file = "hist.png", width = 768, height = 768, res = 125) + + + + + + + + +# options(width = 300) +# print(pr <- prof.tree::prof.tree("./Rprof.out"), limit = NULL +# , pruneFun = function(x) x$percent > 0.01) + +# par(mfrow = c(2, 2)) +# matrixImage(alpha, main = "alpha") +# matrixImage(fit$alpha, main = "fit$alpha") +# matrixImage(beta, main = "beta") +# matrixImage(fit$beta, main = "fit$beta") + +# if (diff(dim(alpha) * dim(beta)) > 0) { +# par(mfrow = c(2, 1)) +# } else { +# par(mfrow = c(1, 2)) +# } +# matrixImage(kronecker(alpha, beta), main = "kronecker(alpha, beta)") +# matrixImage(kronecker(fit$alpha, fit$beta), main = 
"kronecker(fit$alpha, fit$beta)") + +# matrixImage(Delta, main = "Delta") +# matrixImage(fit$Delta, main = "fit$Delta") + + +# local({ +# a <- (-1 * (sum(sign(fit$alpha) * sign(alpha)) < 0)) * fit$alpha / mean(fit$alpha^2) +# b <- alpha / mean(alpha^2) + +# norm(a - b, "F") +# }) +# local({ +# a <- (-1 * (sum(sign(fit$beta) * sign(beta)) < 0)) * fit$beta / mean(fit$beta^2) +# b <- beta / mean(beta^2) + +# norm(a - b, "F") +# }) + + +# # Which Sequence? +# x <- y <- 1 +# replicate(40, x <<- (y <<- x + y) - x) + + +# # Face-Splitting Product +# n <- 100 +# p <- 3 +# q <- 500 +# A <- matrix(rnorm(n * p), n) +# B <- matrix(rnorm(n * q), n) + +# faceSplit <- function(A, B) { +# C <- vapply(seq_len(ncol(A)), function(i) A[, i] * B, B) +# dim(C) <- c(nrow(A), ncol(A) * ncol(B)) +# C +# } + +# all.equal( +# tkhatriRao(A, B), +# faceSplit(A, B) +# ) +# microbenchmark::microbenchmark( +# tkhatriRao(A, B), +# faceSplit(A, B) +# ) + + + + + + +# dist.kron <- function(a0, b0, a1, b1) { +# sqrt(sum(a0^2) * sum(b0^2) - +# 2 * sum(diag(crossprod(a0, a1))) * sum(diag(crossprod(b0, b1))) + +# sum(a1^2) * sum(b1^2)) +# } + + +# alpha <- matrix(rnorm(q * r), q) +# beta <- matrix(rnorm(p * k), p) +# alpha.true <- matrix(rnorm(q * r), q) +# beta.true <- matrix(rnorm(p * k), p) +# all.equal( +# dist.kron(alpha, beta, alpha.true, beta.true), +# norm(kronecker(alpha, beta) - kronecker(alpha.true, beta.true), "F") +# ) + +# A <- matrix(rnorm(p^2), p) +# B <- matrix(rnorm(p^2), p) + +# tr <- function(A) sum(diag(A)) +# tr(crossprod(A, B)) +# tr(tcrossprod(B, A)) + +# tr(crossprod(A, A)) +# tr(tcrossprod(A, A)) +# sum(A^2) + +# alpha <- matrix(rnorm(q * r), q) +# beta <- matrix(rnorm(p * k), p) + +# norm(kronecker(alpha, beta), "F")^2 +# norm(alpha, "F")^2 * norm(beta, "F")^2 +# tr(crossprod(kronecker(alpha, beta))) +# tr(tcrossprod(kronecker(alpha, beta))) +# tr(crossprod(kronecker(t(alpha), t(beta)))) +# tr(crossprod(alpha)) * tr(crossprod(beta)) +# tr(tcrossprod(alpha)) * tr(tcrossprod(beta)) +# tr(crossprod(alpha)) * tr(tcrossprod(beta)) +# sum(alpha^2) * sum(beta^2) + +# alpha <- matrix(rnorm(q * r), q) +# beta <- matrix(rnorm(p * k), p) +# alpha.true <- matrix(rnorm(q * r), q) +# beta.true <- matrix(rnorm(p * k), p) +# microbenchmark::microbenchmark( +# norm(kronecker(alpha, beta), "F")^2, +# norm(alpha, "F")^2 * norm(beta, "F")^2, +# tr(crossprod(kronecker(alpha, beta))), +# tr(tcrossprod(kronecker(alpha, beta))), +# tr(crossprod(kronecker(t(alpha), t(beta)))), +# tr(crossprod(alpha)) * tr(crossprod(beta)), +# tr(tcrossprod(alpha)) * tr(tcrossprod(beta)), +# tr(crossprod(alpha)) * tr(tcrossprod(beta)), +# sum(alpha^2) * sum(beta^2), + +# setup = { +# p <- sample(1:10, 1) +# q <- sample(1:10, 1) +# k <- sample(1:10, 1) +# r <- sample(1:10, 1) +# assign("alpha", matrix(rnorm(q * r), q), .GlobalEnv) +# assign("beta", matrix(rnorm(p * k), p), .GlobalEnv) +# assign("alpha.true", matrix(rnorm(q * r), q), .GlobalEnv) +# assign("beta.true", matrix(rnorm(p * k), p), .GlobalEnv) +# } +# ) + + + +# p <- sample(1:15, 1) # 11 +# q <- sample(1:15, 1) # 3 +# k <- sample(1:15, 1) # 7 +# r <- sample(1:15, 1) # 5 +# A <- matrix(rnorm(q * r), q) +# B <- matrix(rnorm(p * k), p) +# a <- matrix(rnorm(q * r), q) +# b <- matrix(rnorm(p * k), p) + +# all.equal( +# kronecker(A + a, B + b), +# kronecker(A, B) + kronecker(A, b) + kronecker(a, B) + kronecker(a, b) +# ) + + +# p <- 200L +# n <- 100L +# R <- matrix(rnorm(n * p), n) +# A <- matrix(rnorm(p^2), p) # Base Matrix +# B <- A + 0.01 * matrix(rnorm(p^2), p) # Distortion / Update of 
+
+# # Face-Splitting Product
+# n <- 100
+# p <- 3
+# q <- 500
+# A <- matrix(rnorm(n * p), n)
+# B <- matrix(rnorm(n * q), n)
+
+# faceSplit <- function(A, B) {
+#     C <- vapply(seq_len(ncol(A)), function(i) A[, i] * B, B)
+#     dim(C) <- c(nrow(A), ncol(A) * ncol(B))
+#     C
+# }
+
+# all.equal(
+#     tkhatriRao(A, B),
+#     faceSplit(A, B)
+# )
+# microbenchmark::microbenchmark(
+#     tkhatriRao(A, B),
+#     faceSplit(A, B)
+# )
+
+
+# dist.kron <- function(a0, b0, a1, b1) {
+#     sqrt(sum(a0^2) * sum(b0^2) -
+#         2 * sum(diag(crossprod(a0, a1))) * sum(diag(crossprod(b0, b1))) +
+#         sum(a1^2) * sum(b1^2))
+# }
+
+# alpha <- matrix(rnorm(q * r), q)
+# beta <- matrix(rnorm(p * k), p)
+# alpha.true <- matrix(rnorm(q * r), q)
+# beta.true <- matrix(rnorm(p * k), p)
+# all.equal(
+#     dist.kron(alpha, beta, alpha.true, beta.true),
+#     norm(kronecker(alpha, beta) - kronecker(alpha.true, beta.true), "F")
+# )
+
+# A <- matrix(rnorm(p^2), p)
+# B <- matrix(rnorm(p^2), p)
+
+# tr <- function(A) sum(diag(A))
+# tr(crossprod(A, B))
+# tr(tcrossprod(B, A))
+
+# tr(crossprod(A, A))
+# tr(tcrossprod(A, A))
+# sum(A^2)
+
+# alpha <- matrix(rnorm(q * r), q)
+# beta <- matrix(rnorm(p * k), p)
+
+# norm(kronecker(alpha, beta), "F")^2
+# norm(alpha, "F")^2 * norm(beta, "F")^2
+# tr(crossprod(kronecker(alpha, beta)))
+# tr(tcrossprod(kronecker(alpha, beta)))
+# tr(crossprod(kronecker(t(alpha), t(beta))))
+# tr(crossprod(alpha)) * tr(crossprod(beta))
+# tr(tcrossprod(alpha)) * tr(tcrossprod(beta))
+# tr(crossprod(alpha)) * tr(tcrossprod(beta))
+# sum(alpha^2) * sum(beta^2)
+
+# alpha <- matrix(rnorm(q * r), q)
+# beta <- matrix(rnorm(p * k), p)
+# alpha.true <- matrix(rnorm(q * r), q)
+# beta.true <- matrix(rnorm(p * k), p)
+# microbenchmark::microbenchmark(
+#     norm(kronecker(alpha, beta), "F")^2,
+#     norm(alpha, "F")^2 * norm(beta, "F")^2,
+#     tr(crossprod(kronecker(alpha, beta))),
+#     tr(tcrossprod(kronecker(alpha, beta))),
+#     tr(crossprod(kronecker(t(alpha), t(beta)))),
+#     tr(crossprod(alpha)) * tr(crossprod(beta)),
+#     tr(tcrossprod(alpha)) * tr(tcrossprod(beta)),
+#     tr(crossprod(alpha)) * tr(tcrossprod(beta)),
+#     sum(alpha^2) * sum(beta^2),
+
+#     setup = {
+#         p <- sample(1:10, 1)
+#         q <- sample(1:10, 1)
+#         k <- sample(1:10, 1)
+#         r <- sample(1:10, 1)
+#         assign("alpha", matrix(rnorm(q * r), q), .GlobalEnv)
+#         assign("beta", matrix(rnorm(p * k), p), .GlobalEnv)
+#         assign("alpha.true", matrix(rnorm(q * r), q), .GlobalEnv)
+#         assign("beta.true", matrix(rnorm(p * k), p), .GlobalEnv)
+#     }
+# )
+
+
+# p <- sample(1:15, 1) # 11
+# q <- sample(1:15, 1) # 3
+# k <- sample(1:15, 1) # 7
+# r <- sample(1:15, 1) # 5
+# A <- matrix(rnorm(q * r), q)
+# B <- matrix(rnorm(p * k), p)
+# a <- matrix(rnorm(q * r), q)
+# b <- matrix(rnorm(p * k), p)
+
+# all.equal(
+#     kronecker(A + a, B + b),
+#     kronecker(A, B) + kronecker(A, b) + kronecker(a, B) + kronecker(a, b)
+# )
+
+
+# p <- 200L
+# n <- 100L
+# R <- matrix(rnorm(n * p), n)
+# A <- matrix(rnorm(p^2), p)              # Base Matrix
+# B <- A + 0.01 * matrix(rnorm(p^2), p)   # Distortion / Update of A
+# A.inv <- solve(A)
+
+# microbenchmark::microbenchmark(
+#     solve = R %*% solve(B),
+#     neumann.raw = R %*% (A.inv + A.inv %*% (A - B) %*% A.inv + A.inv %*% (A - B) %*% A.inv %*% (A - B) %*% A.inv),
+#     neumann.fun = {
+#         AD <- A.inv %*% (A - B)
+#         res <- A.inv + AD %*% A.inv
+#         res <- A.inv + AD %*% res
+#         R %*% res
+#     }
+# )
+
+# all.equal(
+#     A.inv + A.inv %*% (A - B) %*% A.inv + A.inv %*% (A - B) %*% A.inv %*% (A - B) %*% A.inv,
+#     {
+#         DA <- (A - B) %*% A.inv
+#         res <- A.inv + A.inv %*% DA
+#         res <- A.inv + res %*% DA
+#         res
+#     }
+# )
+
+# all.equal(
+#     A.inv + A.inv %*% (A - B) %*% A.inv + A.inv %*% (A - B) %*% A.inv %*% (A - B) %*% A.inv,
+#     {
+#         AD <- A.inv %*% (A - B)
+#         res <- A.inv + AD %*% A.inv
+#         res <- A.inv + AD %*% res
+#         res
+#     }
+# )
+
+# #####
+# sym <- function(A) A + t(A)
+# n <- 101
+# p <- 7
+# q <- 11
+# r <- 3
+# k <- 5
+# R <- array(rnorm(n * p * q), dim = c(n = n, p = p, q = q))
+# F <- array(rnorm(n * k * r), dim = c(n = n, k = k, r = r))
+# alpha <- array(rnorm(q * r), dim = c(q = q, r = r))
+# beta <- array(rnorm(p * k), dim = c(p = p, k = k))
+# Delta.1 <- sym(matrix(rnorm(q * q), q, q))
+# dim(Delta.1) <- c(q = q, q = q)
+# Delta.2 <- sym(matrix(rnorm(p * p), p, p))
+# dim(Delta.2) <- c(p = p, p = p)
+
+# Delta <- kronecker(Delta.1, Delta.2)
+
+# grad.alpha.1 <- local({
+#     Di.1 <- solve(Delta.1)
+#     Di.2 <- solve(Delta.2)
+#     .R <- sapply(seq_len(n), function(i) tcrossprod(Di.1, R[i, , ]) %*% Di.2)
+#     dim(.R) <- c(q, p, n)
+#     .F <- sapply(seq_len(n), function(i) beta %*% F[i, , ])
+#     dim(.F) <- c(p, r, n)
+
+#     # fixed indexing: the observation index is the LAST dimension of .R, .F
+#     .C <- sapply(seq_len(n), function(i) .R[, , i] %*% .F[, , i])
+#     dim(.C) <- c(q, r, n)
+#     rowSums(.C, dims = 2L)
+# })
+# grad.alpha.2 <- local({
+#     # Delta.1^-1 R' Delta.2^-1
+#     .R <- aperm(R, c(3, 1, 2)) # was c(2, 1, 3); (q, n, p) is needed for the reshape below
+#     dim(.R) <- c(q, n * p)
+#     .R <- solve(Delta.1) %*% .R
+#     dim(.R) <- c(q, n, p)
+#     .R <- aperm(.R, c(3, 2, 1))
+#     dim(.R) <- c(p, n * q)
+#     .R <- solve(Delta.2) %*% .R
+#     dim(.R) <- c(p, n, q)
+#     .R <- aperm(.R, c(2, 3, 1)) # n x q x p
+
+#     # beta F
+#     .F <- aperm(F, c(2, 1, 3))
+#     dim(.F) <- c(k, n * r)
+#     .F <- beta %*% .F
+#     dim(.F) <- c(p, n, r)
+#     .F <- aperm(.F, c(2, 1, 3)) # n x p x r
+
+#     # (Delta.1^-1 R' Delta.2^-1) (beta F)
+#     .R <- aperm(.R, c(1, 3, 2))
+#     dim(.R) <- c(n * p, q)
+#     dim(.F) <- c(n * p, r)
+#     crossprod(.R, .F)
+# })
+
+# all.equal(
+#     grad.alpha.1,
+#     grad.alpha.2
+# )
+
+# all.equal({
+#     .R <- matrix(0, q, p)
+#     Di.1 <- solve(Delta.1)
+#     Di.2 <- solve(Delta.2)
+#     for (i in 1:n) {
+#         .R <- .R + Di.1 %*% t(R[i, , ]) %*% Di.2
+#     }
+#     .R
+# }, {
+#     .R <- R
+#     Di.1 <- solve(Delta.1)
+#     Di.2 <- solve(Delta.2)
+#     .R <- sapply(seq_len(n), function(i) tcrossprod(Di.1, R[i, , ]) %*% Di.2)
+#     dim(.R) <- c(q, p, n)
+#     .R <- aperm(.R, c(3, 1, 2))
+#     colSums(.R)
+# })
+# all.equal({
+#     Di.1 <- solve(Delta.1)
+#     Di.2 <- solve(Delta.2)
+#     .R <- sapply(seq_len(n), function(i) tcrossprod(Di.1, R[i, , ]) %*% Di.2)
+#     dim(.R) <- c(q, p, n)
+#     .R <- aperm(.R, c(3, 1, 2))
+#     .R
+# }, {
+#     .R <- R
+#     dim(.R) <- c(n * p, q)
+#     .R <- .R %*% solve(Delta.1)
+#     dim(.R) <- c(n, p, q)
+#     .R <- aperm(.R, c(1, 3, 2))
+#     dim(.R) <- c(n * q, p)
+#     .R <- .R %*% solve(Delta.2)
+#     dim(.R) <- c(n, q, p)
+#     .R
+# })
+# all.equal({
+#     .F <- matrix(0, p, r)
+#     for (i in 1:n) {
+#         .F <- .F + beta %*% F[i, , ]
+#     }
+#     .F
+# }, {
+#     .F <- apply(F, 1, function(Fi) beta %*% Fi)
+#     dim(.F) <- c(p, r, n)
+#     .F <- aperm(.F, c(3, 1, 2))
+#     colSums(.F)
+# })
+# all.equal({
+#     .F <- apply(F, 1, function(Fi) beta %*% Fi)
+#     dim(.F) <- c(p, r, n)
+#     .F <- aperm(.F, c(3, 1, 2))
+#     colSums(.F)
+# }, {
+#     .F <- aperm(F, c(1, 3, 2))
+#     dim(.F) <- c(n * r, k)
+#     .F <- tcrossprod(.F, beta)
+#     dim(.F) <- c(n, r, p)
+#     t(colSums(.F))
+# })
+
+# all.equal({
+#     Di.1 <- solve(Delta.1)
+#     Di.2 <- solve(Delta.2)
+#     grad.alpha <- 0
+#     grad.beta <- 0
+#     dim(R) <- c(n, p, q)
+#     dim(F) <- c(n, k, r)
+#     for (i in 1:n) {
+#         grad.alpha <- grad.alpha + (
+#             Di.1 %*% t(R[i, , ]) %*% Di.2 %*% beta %*% F[i, , ]
+#         )
+#         grad.beta <- grad.beta + (
+#             Di.2 %*% R[i, , ] %*% Di.1 %*% alpha %*% t(F[i, , ])
+#         )
+#     }
+
+#     g1 <- c(dim(grad.alpha), dim(grad.beta), grad.alpha, grad.beta)
+# }, {
+#     # Note that the order is important since for grad.beta the residuals do NOT
+#     # need to be transposed.
+
+#     # left/right standardized residuals Delta_1^-1 R_i' Delta_2^-1 for i in 1:n
+#     dim(R) <- c(n * p, q)
+#     .R <- R %*% solve(Delta.1)
+#     dim(.R) <- c(n, p, q)
+#     .R <- aperm(.R, c(1, 3, 2))
+#     dim(.R) <- c(n * q, p)
+#     .R <- .R %*% solve(Delta.2)
+#     dim(.R) <- c(n, q, p)
+
+#     # gradient with respect to beta
+#     # Responses times alpha (alpha f_i')
+#     dim(F) <- c(n * k, r)
+#     .F <- tcrossprod(F, alpha)
+#     dim(.F) <- c(n, k, q)
+#     .F <- aperm(.F, c(1, 3, 2))
+#     # Matricize
+#     dim(.R) <- c(n * q, p)
+#     dim(.F) <- c(n * q, k)
+#     grad.beta <- crossprod(.R, .F)
+
+#     # gradient with respect to alpha
+#     # Responses times beta
+#     dim(F) <- c(n, k, r)
+#     .F <- aperm(F, c(1, 3, 2))
+#     dim(.F) <- c(n * r, k)
+#     .F <- tcrossprod(.F, beta)
+#     dim(.F) <- c(n, r, p)
+#     .F <- aperm(.F, c(1, 3, 2))
+#     # Transpose stand. residuals
+#     dim(.R) <- c(n, q, p)
+#     .R <- aperm(.R, c(1, 3, 2))
+#     # Matricize
+#     dim(.R) <- c(n * p, q)
+#     dim(.F) <- c(n * p, r)
+#     grad.alpha <- crossprod(.R, .F)
+
+#     g2 <- c(dim(grad.alpha), dim(grad.beta), grad.alpha, grad.beta)
+# })
+
+
+# microbenchmark::microbenchmark(R1 = {
+#     Di.1 <- solve(Delta.1)
+#     Di.2 <- solve(Delta.2)
+#     grad.alpha <- 0 # matrix(0, q, r)
+#     grad.beta <- 0 # matrix(0, p, k)
+#     dim(R) <- c(n, p, q)
+#     dim(F) <- c(n, k, r)
+#     for (i in 1:n) {
+#         grad.alpha <- grad.alpha + (
+#             Di.1 %*% t(R[i, , ]) %*% Di.2 %*% beta %*% F[i, , ]
+#         )
+#         grad.beta <- grad.beta + (
+#             Di.2 %*% R[i, , ] %*% Di.1 %*% alpha %*% t(F[i, , ])
+#         )
+#     }
+
+#     g1 <- c(dim(grad.alpha), dim(grad.beta), grad.alpha, grad.beta)
+# }, R3 = {
+#     # Note that the order is important since for grad.beta the residuals do NOT
+#     # need to be transposed.
+
+#     # left/right standardized residuals Delta_1^-1 R_i' Delta_2^-1 for i in 1:n
+#     dim(R) <- c(n * p, q)
+#     .R <- R %*% solve(Delta.1)
+#     dim(.R) <- c(n, p, q)
+#     .R <- aperm(.R, c(1, 3, 2))
+#     dim(.R) <- c(n * q, p)
+#     .R <- .R %*% solve(Delta.2)
+#     dim(.R) <- c(n, q, p)
+
+#     # gradient with respect to beta
+#     # Responses times alpha (alpha f_i')
+#     dim(F) <- c(n * k, r)
+#     .F <- tcrossprod(F, alpha)
+#     dim(.F) <- c(n, k, q)
+#     .F <- aperm(.F, c(1, 3, 2))
+#     # Matricize
+#     dim(.R) <- c(n * q, p)
+#     dim(.F) <- c(n * q, k)
+#     grad.beta <- crossprod(.R, .F)
+
+#     # gradient with respect to alpha
+#     # Responses times beta
+#     dim(F) <- c(n, k, r)
+#     .F <- aperm(F, c(1, 3, 2))
+#     dim(.F) <- c(n * r, k)
+#     .F <- tcrossprod(.F, beta)
+#     dim(.F) <- c(n, r, p)
+#     .F <- aperm(.F, c(1, 3, 2))
+#     # Transpose stand. residuals
+#     dim(.R) <- c(n, q, p)
+#     .R <- aperm(.R, c(1, 3, 2))
+#     # Matricize
+#     dim(.R) <- c(n * p, q)
+#     dim(.F) <- c(n * p, r)
+#     grad.alpha <- crossprod(.R, .F)
+
+#     g2 <- c(dim(grad.alpha), dim(grad.beta), grad.alpha, grad.beta)
+# })
+
+
+# n <- 100
+# p <- 7
+# q <- 11
+# k <- 3
+# r <- 5
+
+# X <- array(rnorm(n * p * q), dim = c(n = n, p = p, q = q))
+# F <- array(rnorm(n * k * r), dim = c(n = n, k = k, r = r))
+# alpha <- array(rnorm(q * r), dim = c(q = q, r = r))
+# beta <- array(rnorm(p * k), dim = c(p = p, k = k))
+
+# all.equal({
+#     R <- array(NA, dim = c(n, p, q))
+#     for (i in 1:n) {
+#         R[i, , ] <- X[i, , ] - beta %*% F[i, , ] %*% t(alpha)
+#     }
+#     R
+# }, {
+#     X - (F %x_3% alpha %x_2% beta)
+# }, check.attributes = FALSE)
+
+# microbenchmark::microbenchmark(base = {
+#     R <- array(NA, dim = c(n, p, q))
+#     for (i in 1:n) {
+#         R[i, , ] <- X[i, , ] - beta %*% F[i, , ] %*% t(alpha)
+#     }
+#     R
+# }, ttm = {
+#     X - (F %x_3% alpha %x_2% beta)
+# })
+
+
+# n <- 100; p <- 7; q <- 11; k <- 3; r <- 5
+# sym <- function(x) t(x) + x
+# Di.1 <- sym(matrix(rnorm(q^2), q, q))
+# Di.2 <- sym(matrix(rnorm(p^2), p, p))
+# R <- array(rnorm(n * p * q), dim = c(n, p, q)) # was rnorm(n, p, q)
+# F <- array(rnorm(n * k * r), dim = c(n, k, r)) # was rnorm(n, k, r)
+# alpha <- matrix(rnorm(q * r), q, r)
+# beta <- matrix(rnorm(p * k), p, k)
+
+# all.equal({
+#     .R <- array(NA, dim = c(n, p, q))
+#     for (i in 1:n) {
+#         .R[i, , ] <- Di.2 %*% R[i, , ] %*% Di.1
+#     }
+#     .R
+# }, {
+#     R %x_3% Di.1 %x_2% Di.2
+# })
+# all.equal({
+#     .Rt <- array(NA, dim = c(n, q, p))
+#     for (i in 1:n) {
+#         .Rt[i, , ] <- Di.1 %*% t(R[i, , ]) %*% Di.2
+#     }
+#     .Rt
+# }, {
+#     .Rt <- R %x_3% Di.1 %x_2% Di.2
+#     aperm(.Rt, c(1, 3, 2))
+# })
+# all.equal({
+#     .Fa <- array(NA, dim = c(n, q, k))
+#     for (i in 1:n) {
+#         .Fa[i, , ] <- alpha %*% t(F[i, , ])
+#     }
+#     .Fa
+# }, {
+#     aperm(F %x_3% alpha, c(1, 3, 2))
+# })
+# all.equal({
+#     .Fb <- array(NA, dim = c(n, p, r))
+#     for (i in 1:n) {
+#         .Fb[i, , ] <- beta %*% F[i, , ]
+#     }
+#     .Fb
+# }, {
+#     F %x_2% beta
+# })
+# all.equal({
+#     .F <- array(NA, dim = c(n, p, q))
+#     for (i in 1:n) {
+#         .F[i, , ] <- beta %*% F[i, , ] %*% t(alpha)
+#     }
+#     .F
+# }, {
+#     F %x_3% alpha %x_2% beta
+# })
+# all.equal({
+#     .Ga <- 0
+#     for (i in 1:n) {
+#         .Ga <- .Ga + Di.1 %*% t(R[i, , ]) %*% Di.2 %*% beta %*% F[i, , ]
+#     }
+#     .Ga
+# }, {
+#     .R <- R %x_3% Di.1 %x_2% Di.2
+#     dim(.R) <- c(n * p, q)
+#     .Fb <- F %x_2% beta
+#     dim(.Fb) <- c(n * p, r)
+#     crossprod(.R, .Fb)
+# })
+# all.equal({
+#     .Gb <- 0
+#     for (i in 1:n) {
+#         .Gb <- .Gb + Di.2 %*% R[i, , ] %*% Di.1 %*% alpha %*% t(F[i, , ])
+#     }
+#     .Gb
+# }, {
+#     .Rt <- aperm(R %x_3% Di.1 %x_2% Di.2, c(1, 3, 2))
+#     dim(.Rt) <- c(n * q, p)
+#     .Fa <- aperm(F %x_3% alpha, c(1, 3, 2))
+#     dim(.Fa) <- c(n * q, k)
+#     crossprod(.Rt, .Fa)
+# })
+# all.equal({
+#     .Ga <- 0
+#     for (i in 1:n) {
+#         .Ga <- .Ga + Di.1 %*% t(R[i, , ]) %*% Di.2 %*% beta %*% F[i, , ]
+#     }
+#     .Gb <- 0
+#     for (i in 1:n) {
+#         .Gb <- .Gb + Di.2 %*% R[i, , ] %*% Di.1 %*% alpha %*% t(F[i, , ])
+#     }
+#     c(.Ga, .Gb)
+# }, {
+#     .R <- R %x_3% Di.1 %x_2% Di.2
+#     .Fb <- F %x_2% beta
+#     .Fa <- aperm(F %x_3% alpha, c(1, 3, 2))
+
+#     dim(.R) <- c(n * p, q)
+#     dim(.Fb) <- c(n * p, r)
+#     .Ga <- crossprod(.R, .Fb)
+
+#     dim(.R) <- c(n, p, q)
+#     .R <- aperm(.R, c(1, 3, 2))
+#     dim(.R) <- c(n * q, p)
+#     dim(.Fa) <- c(n * q, k)
+#     .Gb <- crossprod(.R, .Fa)
+
+#     c(.Ga, .Gb)
+# })
+# all.equal({
+#     .Ga <- 0
+#     for (i in 1:n) {
+#         .Ga <- .Ga + Di.1 %*% t(R[i, , ]) %*% Di.2 %*% beta %*% F[i, , ]
+#     }
+#     .Gb <- 0
+#     for (i in 1:n) {
+#         .Gb <- .Gb + Di.2 %*% R[i, , ] %*% Di.1 %*% alpha %*% t(F[i, , ])
+#     }
+#     c(.Ga, .Gb)
+# }, {
+#     .R <- R %x_3% Di.1 %x_2% Di.2
+
+#     .Ga <- tcrossprod(mat(.R, 3), mat(F %x_2% beta, 3))
+#     .Gb <- tcrossprod(mat(.R, 2), mat(F %x_3% alpha, 2))
+
+#     c(.Ga, .Gb)
+# })
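+
+# # Numerical check of the matricized (tensor) forms of S_1 and S_2 from the
+# # LaTeX notes (the `\todo{Check tensor form!}` in LaTeX/main.tex), as used
+# # for `S.1` / `S.2` in `kpir.approx`. Illustrative consistency check only;
+# # `mat`, `%x_2%`, `%x_3%` come from `tensorPredictors`.
+# n <- 101; p <- 7; q <- 11
+# R <- array(rnorm(n * p * q), dim = c(n, p, q))
+# D.1.inv <- solve(crossprod(matrix(rnorm(q^2), q, q))) # SPD (q x q)
+# D.2.inv <- solve(crossprod(matrix(rnorm(p^2), p, p))) # SPD (p x p)
+# all.equal({
+#     S.1 <- 0
+#     for (i in 1:n) {
+#         S.1 <- S.1 + t(R[i, , ]) %*% D.2.inv %*% R[i, , ]
+#     }
+#     S.1 / n
+# }, {
+#     tcrossprod(mat(R, 3), mat(R %x_2% D.2.inv, 3)) / n
+# })
+# all.equal({
+#     S.2 <- 0
+#     for (i in 1:n) {
+#         S.2 <- S.2 + R[i, , ] %*% D.1.inv %*% t(R[i, , ])
+#     }
+#     S.2 / n
+# }, {
+#     tcrossprod(mat(R, 2), mat(R %x_3% D.1.inv, 2)) / n
+# })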
+
+
+
+
+# n <- 101; p <- 5; q <- 7
+
+# sym <- function(x) crossprod(x)
+# D1 <- sym(matrix(rnorm(q^2), q, q))
+# D2 <- sym(matrix(rnorm(p^2), p, p))
+
+# X <- tensorPredictors:::rmvnorm(n, sigma = kronecker(D1, D2))
+# dim(X) <- c(n, p, q)
+
+# D1.hat <- tcrossprod(mat(X, 3)) / n
+# D2.hat <- tcrossprod(mat(X, 2)) / n
+
+# local({
+#     par(mfrow = c(2, 2))
+#     matrixImage(D1, main = "D1")
+#     matrixImage(D1.hat, main = "D1.hat")
+#     matrixImage(D2, main = "D2")
+#     matrixImage(D2.hat, main = "D2.hat")
+# })
+
+# sum(X^2) / n
+# sum(diag(D1.hat))
+# sum(diag(D2.hat))
+# sum(diag(kronecker(D1, D2)))
+# sum(diag(kronecker(D1.hat / sqrt(sum(diag(D1.hat))),
+#                    D2.hat / sqrt(sum(diag(D1.hat))))))
+
+# all.equal({
+#     mat(X, 1) %*% kronecker(D1.hat, D2.hat)
+# }, {
+#     mat(X %x_3% D1.hat %x_2% D2.hat, 1)
+# })
+# all.equal({
+#     C <- mat(X, 1) %*% kronecker(D1.hat, D2.hat) * (n / sum(X^2))
+#     dim(C) <- c(n, p, q)
+#     C
+# }, {
+#     (X %x_3% D1.hat %x_2% D2.hat) / sum(diag(D1.hat))
+# })
+
+
+# D.1 <- tcrossprod(mat(X, 3))
+# D.2 <- tcrossprod(mat(X, 2))
+# tr <- sum(diag(D.1))
+# D.1 <- D.1 / sqrt(n * tr)
+# D.2 <- D.2 / sqrt(n * tr)
+
+# sum(diag(kronecker(D1, D2)))
+# sum(diag(kronecker(D.1, D.2)))
+# det(kronecker(D1, D2))
+# det(kronecker(D.1, D.2))
+# det(D.1)^p * det(D.2)^q
+
+# log(det(kronecker(D.1, D.2)))
+# p * log(det(D.1)) + q * log(det(D.2))
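+
+# # Consistency check for the unbiased covariance estimates documented in
+# # `kpir.approx` (Delta_1 = n^-1 sum_i R_i' R_i, Delta_2 = n^-1 sum_i R_i R_i'):
+# # the explicit sums should agree with the mode-wise matricized products used
+# # in the implementation. Illustrative scratch only.
+# n <- 101; p <- 5; q <- 7
+# R <- array(rnorm(n * p * q), dim = c(n, p, q))
+# all.equal({
+#     Reduce(`+`, lapply(1:n, function(i) crossprod(R[i, , ]))) / n
+# }, {
+#     tcrossprod(mat(R, 3)) / n
+# })
+# all.equal({
+#     Reduce(`+`, lapply(1:n, function(i) tcrossprod(R[i, , ]))) / n
+# }, {
+#     tcrossprod(mat(R, 2)) / n
+# })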
diff --git a/tensorPredictors/R/kpir_approx.R b/tensorPredictors/R/kpir_approx.R
new file mode 100644
index 0000000..28cd564
--- /dev/null
+++ b/tensorPredictors/R/kpir_approx.R
@@ -0,0 +1,136 @@
+#' Using unbiased (but not MLE) estimates for the Kronecker decomposed
+#' covariance matrices Delta_1, Delta_2 for approximating the log-likelihood,
+#' giving a closed form solution for the gradient.
+#'
+#' Delta_1 = n^-1 sum_i R_i' R_i,
+#' Delta_2 = n^-1 sum_i R_i R_i'.
+#'
+#' @export
+kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
+    max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
+    nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
+    eps = .Machine$double.eps,
+    logger = NULL
+) {
+
+    # Check if X and Fy have same number of observations
+    stopifnot(nrow(X) == NROW(Fy))
+    n <- nrow(X) # Number of observations
+
+    # Get and check predictor dimensions
+    if (length(dim(X)) == 2L) {
+        stopifnot(!missing(shape))
+        stopifnot(ncol(X) == prod(shape[1:2]))
+        p <- as.integer(shape[1]) # Predictor "height"
+        q <- as.integer(shape[2]) # Predictor "width"
+    } else if (length(dim(X)) == 3L) {
+        p <- dim(X)[2]
+        q <- dim(X)[3]
+        dim(X) <- c(n, p * q)
+    } else {
+        stop("'X' must be a matrix or 3-tensor")
+    }
+
+    # Get and check response dimensions
+    if (!is.array(Fy)) {
+        Fy <- as.array(Fy)
+    }
+    if (length(dim(Fy)) == 1L) {
+        k <- r <- 1L
+        dim(Fy) <- c(n, 1L)
+    } else if (length(dim(Fy)) == 2L) {
+        stopifnot(!missing(shape))
+        stopifnot(ncol(Fy) == prod(shape[3:4]))
+        k <- as.integer(shape[3]) # Response functional "height"
+        r <- as.integer(shape[4]) # Response functional "width"
+    } else if (length(dim(Fy)) == 3L) {
+        k <- dim(Fy)[2]
+        r <- dim(Fy)[3]
+        dim(Fy) <- c(n, k * r)
+    } else {
+        stop("'Fy' must be a vector, matrix or 3-tensor")
+    }
+
+
+    ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
+    # Vectorize
+    dim(Fy) <- c(n, k * r)
+    dim(X) <- c(n, p * q)
+    # Solve
+    cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
+    if (n <= k * r || qr(cpFy)$rank < k * r) {
+        # In case of an under-determined system, replace the inverse in the
+        # normal equation by the Moore-Penrose pseudo inverse
+        B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
+    } else {
+        # Compute OLS estimate by the Normal Equation
+        B <- t(solve(cpFy, crossprod(Fy, X)))
+    }
+
+    # De-Vectorize (from now on tensor arithmetics)
+    dim(Fy) <- c(n, k, r)
+    dim(X) <- c(n, p, q)
+
+    # Decompose `B = alpha x beta` into `alpha` and `beta`
+    c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
+
+    # Compute residuals
+    R <- X - (Fy %x_3% alpha0 %x_2% beta0)
+
+    # Covariance estimates and scaling factor
+    Delta.1 <- tcrossprod(mat(R, 3))
+    Delta.2 <- tcrossprod(mat(R, 2))
+    s <- sum(diag(Delta.1))
+
+    # Inverse Covariances
+    Delta.1.inv <- solve(Delta.1)
+    Delta.2.inv <- solve(Delta.2)
+
+    # cross dependent covariance estimates
+    S.1 <- n^-1 * tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3))
+    S.2 <- n^-1 * tcrossprod(mat(R, 2), mat(R %x_3% Delta.1.inv, 2))
+
+    # Evaluate negative log-likelihood (2 pi term dropped)
+    loss <- -0.5 * n * (p * q * log(s) - p * log(det(Delta.1)) -
+        q * log(det(Delta.2)) - s * sum(S.1 * Delta.1.inv))
+
+    # Gradient "generating" tensor
+    G <- (sum(S.1 * Delta.1.inv) - p * q / s) * R
+    G <- G + R %x_2% ((diag(q, p, p) - s * (Delta.2.inv %*% S.2)) %*% Delta.2.inv)
+    G <- G + R %x_3% ((diag(p, q, q) - s * (Delta.1.inv %*% S.1)) %*% Delta.1.inv)
+    G <- G + s * (R %x_2% Delta.2.inv %x_3% Delta.1.inv)
+
+    # Call history callback (logger) before the first iteration
+    if (is.function(logger)) {
+        logger(0L, loss, alpha0, beta0, Delta.1, Delta.2, NA)
+    }
+
+
+    ### Step 2: MLE estimate with LS solution as starting value
+    a0 <- 0
+    a1 <- 1
+    alpha1 <- alpha0
+    beta1 <- beta0
+
+    # main descent loop
+    no.nesterov <- TRUE
+    break.reason <- NA
+    for (iter in seq_len(max.iter)) {
+        if (no.nesterov) {
+            # without extrapolation as fallback
+            alpha.moment <- alpha1
+            beta.moment <- beta1
+        } else {
+            # extrapolation using previous direction
+            alpha.moment <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0)
+            beta.moment  <- beta1  + ((a0 - 1) / a1) * ( beta1 -  beta0)
+        }
+
+        # Extrapolated residuals (moved inside the loop: `alpha.moment`,
+        # `beta.moment` are recomputed in every iteration)
+        R <- X - (Fy %x_3% alpha.moment %x_2% beta.moment)
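+
+        # Sketch of the still-missing remainder of the loop body. This is an
+        # assumption, modeled on `kpir.momentum` and on the matricized
+        # gradient identities checked in `simulations/kpir_sim.R`, NOT the
+        # verified method: recompute `Delta.1`, `Delta.2`, `s`, `S.1`, `S.2`
+        # and the gradient "generating" tensor `G` from the extrapolated
+        # residuals, then read the gradients off `G` mode-wise,
+        #
+        #     grad.alpha <- tcrossprod(mat(G, 3), mat(Fy %x_2% beta.moment, 3))
+        #     grad.beta  <- tcrossprod(mat(G, 2), mat(Fy %x_3% alpha.moment, 2))
+        #
+        # take a line-searched step (`step.size`, `max.line.iter`), and update
+        # a0 <- a1, a1 <- nesterov.scaling(a1, iter) together with the
+        # previous/current iterates (alpha0, beta0) <- (alpha1, beta1).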
B <- kronecker(alpha, beta) resid <- X - tcrossprod(Fy, B) @@ -85,9 +119,9 @@ kpir.base <- function(X, Fy, p, t, k = 1L, r = 1L, d1 = 1L, d2 = 1L, } # Check break condition 1. - if (norm(Delta - Delta.last, 'F') < eps1 * norm(Delta, 'F')) { + if (norm(Delta - Delta.last, "F") < eps1 * norm(Delta, "F")) { # Check break condition 2. - if (norm(B - B.last, 'F') < eps2 * norm(B, 'F')) { + if (norm(B - B.last, "F") < eps2 * norm(B, "F")) { break } } diff --git a/tensorPredictors/R/kpir_kron.R b/tensorPredictors/R/kpir_kron.R index bbdf5ff..7b05985 100644 --- a/tensorPredictors/R/kpir_kron.R +++ b/tensorPredictors/R/kpir_kron.R @@ -65,7 +65,7 @@ kpir.kron <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), # De-Vectroize (from now on tensor arithmetics) dim(Fy) <- c(n, k, r) dim(X) <- c(n, p, q) - + # Decompose `B = alpha x beta` into `alpha` and `beta` c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k)) diff --git a/tensorPredictors/R/kpir_momentum.R b/tensorPredictors/R/kpir_momentum.R index 3ac5319..9471171 100644 --- a/tensorPredictors/R/kpir_momentum.R +++ b/tensorPredictors/R/kpir_momentum.R @@ -1,10 +1,10 @@ -#' Gradient Descent Bases Tensor Predictors method with Nesterov Accelerated +#' Gradient Descent based Tensor Predictors method with Nesterov Accelerated #' Momentum #' #' @export kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), max.iter = 500L, max.line.iter = 50L, step.size = 1e-3, - nesterov.scaling = function(a, t) { 0.5 * (1 + sqrt(1 + (2 * a)^2)) }, + nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)), eps = .Machine$double.eps, logger = NULL ) {