wip: writing next method kpir_approx

Daniel Kapla 2022-04-29 18:37:25 +02:00
parent b2c6d3ca56
commit 203028e255
6 changed files with 1174 additions and 18 deletions


@@ -306,10 +306,11 @@ because with $\|\mat{R}_i\|_F^2 = \tr \mat{R}_i\t{\mat{R}_i} = \tr \t{\mat{R}_i}
 \todo{ prove they are consistent, especially $\widetilde{\mat\Delta} = \tilde{s}^{-1}(\widetilde{\mat\Delta}_1\otimes\widetilde{\mat\Delta}_2)$!}
 The hoped-for benefit is that these covariance estimates are available in closed form, which means there is no need for an additional iterative estimation step. Before we start with the derivation of the gradients, define the following two quantities
-\begin{displaymath}
-    \mat{S}_1 = \frac{1}{n}\sum_{i = 1}^n \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\quad{\color{gray}(q\times q)}, \qquad
-    \mat{S}_2 = \frac{1}{n}\sum_{i = 1}^n \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\quad{\color{gray}(p\times p)}.
-\end{displaymath}
+\begin{align*}
+    \mat{S}_1 &= \frac{1}{n}\sum_{i = 1}^n \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i = \frac{1}{n}\ten{R}_{(3)}\t{(\ten{R}\ttm[2]\widetilde{\mat{\Delta}}_2^{-1})_{(3)}}\quad{\color{gray}(q\times q)}, \\
+    \mat{S}_2 &= \frac{1}{n}\sum_{i = 1}^n \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} = \frac{1}{n}\ten{R}_{(2)}\t{(\ten{R}\ttm[3]\widetilde{\mat{\Delta}}_1^{-1})_{(2)}}\quad{\color{gray}(p\times p)}.
+\end{align*}
+\todo{Check tensor form!}
 Now, the matrix normal with the covariance matrix of the vectorized quantities of the form $\mat{\Delta} = s^{-1}(\mat{\Delta}_1\otimes\mat{\Delta}_2)$ has the form
 \begin{align*}
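The \todo{Check tensor form!} can be settled numerically; a minimal sketch in R, assuming `mat`, `%x_2%` and `%x_3%` from the tensorPredictors package and a residual tensor with observations on the first mode (the conventions used in simulations/kpir_sim.R below):

library(tensorPredictors)

n <- 13; p <- 5; q <- 7
R <- array(rnorm(n * p * q), dim = c(n, p, q))
D1.inv <- solve(crossprod(matrix(rnorm(q^2), q)))  # SPD stand-in for Delta_1^-1
D2.inv <- solve(crossprod(matrix(rnorm(p^2), p)))  # SPD stand-in for Delta_2^-1

# S_1: per-observation sum vs. mode-3 matricization
S1.sum <- 0
for (i in 1:n) S1.sum <- S1.sum + crossprod(R[i, , ], D2.inv %*% R[i, , ]) / n
S1.ten <- tcrossprod(mat(R, 3), mat(R %x_2% D2.inv, 3)) / n
all.equal(S1.sum, S1.ten)  # expect TRUE

# S_2: per-observation sum vs. mode-2 matricization
S2.sum <- 0
for (i in 1:n) S2.sum <- S2.sum + R[i, , ] %*% tcrossprod(D1.inv, R[i, , ]) / n
S2.ten <- tcrossprod(mat(R, 2), mat(R %x_3% D1.inv, 2)) / n
all.equal(S2.sum, S2.ten)  # expect TRUE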

simulations/kpir_sim.R (new file, 985 additions)

@@ -0,0 +1,985 @@
library(tensorPredictors)
library(dplyr)
library(ggplot2)
### Exec all methods for a given data set and collect logs ###
sim <- function(X, Fy, shape, alpha.true, beta.true, max.iter = 500L) {

    # Logger creator: substitutes the per-method history data.frame (e.g.
    # `hist.base`) for `hist`, so that `<<-` writes into the caller's frame.
    logger <- function(name) {
        eval(substitute(function(iter, loss, alpha, beta, ...) {
            hist[iter + 1L, ] <<- c(
                iter = iter,
                loss = loss,
                dist = (dist <- dist.subspace(c(kronecker(alpha.true, beta.true)),
                                              c(kronecker(alpha, beta)))),
                dist.alpha = (dist.alpha <- dist.subspace(c(alpha.true), c(alpha))),
                dist.beta  = (dist.beta  <- dist.subspace(c( beta.true), c(beta ))),
                norm.alpha = norm(alpha, "F"),
                norm.beta  = norm(beta, "F")
            )

            cat(sprintf(
                "%3d | l = %-12.4f - dist = %-.4e - alpha(%d, %d) = %-.4e - beta(%d, %d) = %-.4e\n",
                iter, loss,
                dist,
                nrow(alpha), ncol(alpha), dist.alpha,
                nrow(beta),  ncol(beta),  dist.beta
            ))
        }, list(hist = as.symbol(paste0("hist.", name)))))
    }

    # Initialize logger history targets
    hist.base <- hist.new <- hist.momentum <- # hist.kron <-
        data.frame(iter = seq(0L, max.iter),
            loss = NA, dist = NA, dist.alpha = NA, dist.beta = NA,
            norm.alpha = NA, norm.beta = NA
        )
    hist.kron <- NULL # TODO: fit kron version

    # Base (old)
    kpir.base(X, Fy, shape, max.iter = max.iter, logger = logger("base"))

    # New (simple Gradient Descent)
    kpir.new(X, Fy, shape, max.iter = max.iter, logger = logger("new"))

    # Gradient Descent with Nesterov Momentum
    kpir.momentum(X, Fy, shape, max.iter = max.iter, logger = logger("momentum"))

    # # Residual Covariance Kronecker product assumption version
    # kpir.kron(X, Fy, shape, max.iter = max.iter, logger = logger("kron"))

    # Add method tags
    hist.base$type     <- factor("base")
    hist.new$type      <- factor("new")
    hist.momentum$type <- factor("momentum")
    # hist.kron$type   <- factor("kron")

    # Combine results and return
    rbind(hist.base, hist.new, hist.momentum, hist.kron)
}
## Generate some test data / DEBUG
n <- 200 # Sample Size
p <- sample(1:15, 1) # 11
q <- sample(1:15, 1) # 3
k <- sample(1:15, 1) # 7
r <- sample(1:15, 1) # 5
print(c(n, p, q, k, r))
hist <- NULL
for (repetition in 1:20) {
    alpha.true <- alpha <- matrix(rnorm(q * r), q, r)
    beta.true  <- beta  <- matrix(rnorm(p * k), p, k)
    y <- rnorm(n)
    # Fy columns are sine/cosine pairs of increasing integer frequency:
    # sin(y), cos(y), sin(2 y), cos(2 y), ... (using cos(x) = sin(x + pi / 2))
    Fy <- do.call(cbind, Map(function(slope, offset) {
        sin(slope * y + offset)
    },
        head(rep(seq(1, ceiling(0.5 * k * r)), each = 2), k * r),
        head(rep(c(0, pi / 2), ceiling(0.5 * k * r)), k * r)
    ))
    # AR(1)-type covariance: Delta_ij = 0.5^|i - j|
    Delta <- 0.5^abs(outer(seq_len(p * q), seq_len(p * q), `-`))
    X <- tcrossprod(Fy, kronecker(alpha, beta)) + CVarE:::rmvnorm(n, sigma = Delta)
    hist.sim <- sim(X, Fy, shape = c(p, q, k, r), alpha.true, beta.true)
    hist.sim$repetition <- repetition
    hist <- rbind(hist, hist.sim)
}
saveRDS(hist, file = "AR.rds")
hist$repetition <- factor(hist$repetition)
ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) +
    geom_line(aes(y = loss)) +
    geom_point(data = with(sub <- subset(hist, !is.na(loss)),
        aggregate(sub, list(type, repetition), tail, 1)
    ), aes(y = loss)) +
    labs(
        title = bquote(paste("Optimization Objective: negative log-likelihood ",
            l(hat(alpha), hat(beta)))),
        subtitle = bquote(paste(Delta[i][j] == 0.5^abs(i - j), ", ",
            "20 repetitions, ", n == .(n), ", ",
            p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))),
        x = "nr. of iterations",
        y = expression(l(hat(alpha), hat(beta))),
        color = "method"
    ) +
    theme(legend.position = "bottom")

dev.print(png, file = "sim01_loss.png", width = 768, height = 768, res = 125)
ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) +
    geom_line(aes(y = dist)) +
    geom_point(data = with(sub <- subset(hist, !is.na(dist)),
        aggregate(sub, list(type, repetition), tail, 1)
    ), aes(y = dist)) +
    labs(
        title = bquote(paste("Distance of estimate ", hat(B), " to true ", B == alpha %*% beta)),
        subtitle = bquote(paste(Delta[i][j] == 0.5^abs(i - j), ", ",
            "20 repetitions, ", n == .(n), ", ",
            p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))),
        x = "nr. of iterations",
        y = expression(abs(B * B^T - hat(B) * hat(B)^T)),
        color = "method"
    ) +
    theme(legend.position = "bottom")

dev.print(png, file = "sim01_dist.png", width = 768, height = 768, res = 125)
ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) +
    geom_line(aes(y = dist.alpha)) +
    geom_point(data = with(sub <- subset(hist, !is.na(dist.alpha)),
        aggregate(sub, list(type, repetition), tail, 1)
    ), aes(y = dist.alpha)) +
    labs(
        title = bquote(paste("Distance of estimate ", hat(alpha), " to true ", alpha)),
        subtitle = bquote(paste(Delta[i][j] == 0.5^abs(i - j), ", ",
            "20 repetitions, ", n == .(n), ", ",
            p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))),
        x = "nr. of iterations",
        y = expression(abs(alpha * alpha^T - hat(alpha) * hat(alpha)^T)),
        color = "method"
    ) +
    theme(legend.position = "bottom")

dev.print(png, file = "sim01_dist_alpha.png", width = 768, height = 768, res = 125)
ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) +
    geom_line(aes(y = dist.beta)) +
    geom_point(data = with(sub <- subset(hist, !is.na(dist.beta)),
        aggregate(sub, list(type, repetition), tail, 1)
    ), aes(y = dist.beta)) +
    labs(
        title = bquote(paste("Distance of estimate ", hat(beta), " to true ", beta)),
        subtitle = bquote(paste(Delta[i][j] == 0.5^abs(i - j), ", ",
            "20 repetitions, ", n == .(n), ", ",
            p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))),
        x = "nr. of iterations",
        y = expression(abs(beta * beta^T - hat(beta) * hat(beta)^T)),
        color = "method"
    ) +
    theme(legend.position = "bottom")

dev.print(png, file = "sim01_dist_beta.png", width = 768, height = 768, res = 125)
ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) +
    geom_line(aes(y = norm.alpha)) +
    geom_point(data = with(sub <- subset(hist, !is.na(norm.alpha)),
        aggregate(sub, list(type, repetition), tail, 1)
    ), aes(y = norm.alpha)) +
    labs(
        title = expression(paste("Norm of ", hat(alpha))),
        subtitle = bquote(paste(Delta[i][j] == 0.5^abs(i - j), ", ",
            "20 repetitions, ", n == .(n), ", ",
            p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))),
        x = "nr. of iterations",
        y = expression(abs(hat(alpha))[F]),
        color = "method"
    ) +
    theme(legend.position = "bottom")

dev.print(png, file = "sim01_norm_alpha.png", width = 768, height = 768, res = 125)
ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) +
    geom_line(aes(y = norm.beta)) +
    geom_point(data = with(sub <- subset(hist, !is.na(norm.beta)),
        aggregate(sub, list(type, repetition), tail, 1)
    ), aes(y = norm.beta)) +
    labs(
        title = expression(paste("Norm of ", hat(beta))),
        subtitle = bquote(paste(Delta[i][j] == 0.5^abs(i - j), ", ",
            "20 repetitions, ", n == .(n), ", ",
            p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))),
        x = "nr. of iterations",
        y = expression(abs(hat(beta))[F]),
        color = "method"
    ) +
    theme(legend.position = "bottom")

dev.print(png, file = "sim01_norm_beta.png", width = 768, height = 768, res = 125)
# local({
# par(mfrow = c(2, 3), oma = c(2, 1, 1, 1), mar = c(3.1, 2.1, 2.1, 1.1), lwd = 2)
# plot(c(1, max.iter), range(c(hist.base$loss, hist.new$loss, hist.momentum$loss, hist.kron$loss), na.rm = TRUE),
# type = "n", log = "x", main = "loss")
# lines( hist.base$loss, col = 2)
# lines( hist.new$loss, col = 3)
# lines(hist.momentum$loss, col = 4)
# lines( hist.kron$loss, col = 5)
# yrange <- range(c(hist.base$step.size, hist.new$step.size, hist.momentum$step.size, hist.kron$step.size),
# na.rm = TRUE)
# plot(c(1, max.iter), yrange,
# type = "n", log = "x", main = "step.size")
# lines( hist.base$step.size, col = 2)
# lines( hist.new$step.size, col = 3)
# lines(hist.momentum$step.size, col = 4)
# lines( hist.kron$step.size, col = 5)
# # lines( hist.base$step.size, col = 4) # there is no step.size
# plot(0, 0, type = "l", bty = "n", xaxt = "n", yaxt = "n")
# legend("topleft", legend = c("Base", "GD", "GD + Momentum", "Kron + GD + Momentum"), col = 2:5,
# lwd = par("lwd"), xpd = TRUE, horiz = FALSE, cex = 1.2, bty = "n",
# x.intersp = 1, y.intersp = 1.5)
# # xpd = TRUE makes the legend plot to the figure
# plot(c(1, max.iter), range(c(hist.base$dist, hist.new$dist, hist.momentum$dist, hist.kron$dist), na.rm = TRUE),
# type = "n", log = "x", main = "dist")
# lines( hist.base$dist, col = 2)
# lines( hist.new$dist, col = 3)
# lines(hist.momentum$dist, col = 4)
# lines( hist.kron$dist, col = 5)
# plot(c(1, max.iter), range(c(hist.base$dist.alpha, hist.new$dist.alpha, hist.momentum$dist.alpha, hist.kron$dist.alpha), na.rm = TRUE),
# type = "n", log = "x", main = "dist.alpha")
# lines( hist.base$dist.alpha, col = 2)
# lines( hist.new$dist.alpha, col = 3)
# lines(hist.momentum$dist.alpha, col = 4)
# lines( hist.kron$dist.alpha, col = 5)
# plot(c(1, max.iter), range(c(hist.base$dist.beta, hist.new$dist.beta, hist.momentum$dist.beta, hist.kron$dist.beta), na.rm = TRUE),
# type = "n", log = "x", main = "dist.beta")
# lines( hist.base$dist.beta, col = 2)
# lines( hist.new$dist.beta, col = 3)
# lines(hist.momentum$dist.beta, col = 4)
# lines( hist.kron$dist.beta, col = 5)
# # par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE)
# # plot(0, 0, type = 'l', bty = 'n', xaxt = 'n', yaxt = 'n')
# # legend('bottom', legend = c('GD', 'GD + Nesterov Momentum', 'Alternating'), col = 2:4,
# # lwd = 5, xpd = TRUE, horiz = TRUE, cex = 1, seg.len = 1, bty = 'n')
# # # xpd = TRUE makes the legend plot to the figure
# })
# dev.print(png, file = "loss.png", width = 768, height = 768, res = 125)
# with(list(a1 = alpha, a2 = fit.base$alpha, a3 = fit.new$alpha, a4 = fit.momentum$alpha,
# b1 = beta, b2 = fit.base$beta, b3 = fit.new$beta, b4 = fit.momentum$beta), {
# par(mfrow = c(2, 4))
# a2 <- sign(sum(sign(a1 * a2))) * a2
# a3 <- sign(sum(sign(a1 * a3))) * a3
# a4 <- sign(sum(sign(a1 * a4))) * a4
# b2 <- sign(sum(sign(b1 * b2))) * b2
# b3 <- sign(sum(sign(b1 * b3))) * b3
# b4 <- sign(sum(sign(b1 * b4))) * b4
# matrixImage(a1, main = expression(alpha))
# matrixImage(a2, main = expression(paste(hat(alpha)["Base"])))
# matrixImage(a3, main = expression(paste(hat(alpha)["GD"])))
# matrixImage(a4, main = expression(paste(hat(alpha)["GD+Nest"])))
# matrixImage(b1, main = expression(beta))
# matrixImage(b2, main = expression(paste(hat(beta)["Base"])))
# matrixImage(b3, main = expression(paste(hat(beta)["GD"])))
# matrixImage(b4, main = expression(paste(hat(beta)["GD+Nest"])))
# })
# dev.print(png, file = "estimates.png", width = 768, height = 768, res = 125)
# with(list(d1 = Delta, d2 = fit.base$Delta, d3 = fit.new$Delta, d4 = fit.momentum$Delta), {
# par(mfrow = c(2, 2))
# matrixImage(d1, main = expression(Delta))
# matrixImage(d2, main = expression(hat(Delta)["Base"]))
# matrixImage(d3, main = expression(hat(Delta)["GD"]))
# matrixImage(d4, main = expression(hat(Delta)["GD+Nest"]))
# })
# dev.print(png, file = "Delta.png", width = 768, height = 768, res = 125)
# with(list(a1 = alpha, a2 = fit.base$alpha, a3 = fit.new$alpha, a4 = fit.momentum$alpha,
# b1 = beta, b2 = fit.base$beta, b3 = fit.new$beta, b4 = fit.momentum$beta), {
# par(mfrow = c(2, 2))
# matrixImage(kronecker(a1, b1), main = expression(B))
# matrixImage(kronecker(a2, b2), main = expression(hat(B)["Base"]))
# matrixImage(kronecker(a3, b3), main = expression(hat(B)["GD"]))
# matrixImage(kronecker(a4, b4), main = expression(hat(B)["GD+Nest"]))
# })
# dev.print(png, file = "B.png", width = 768, height = 768, res = 125)
# with(list(a1 = alpha, a2 = fit.base$alpha, a3 = fit.new$alpha, a4 = fit.momentum$alpha,
# b1 = beta, b2 = fit.base$beta, b3 = fit.new$beta, b4 = fit.momentum$beta), {
# par(mfrow = c(3, 1), lwd = 1)
# d2 <- kronecker(a1, b1) - kronecker(a2, b2)
# d3 <- kronecker(a1, b1) - kronecker(a3, b3)
# d4 <- kronecker(a1, b1) - kronecker(a4, b4)
# xlim <- c(-1, 1) * max(abs(c(d2, d3, d4)))
# breaks <- seq(xlim[1], xlim[2], len = 41)
# hist(d2, main = expression(paste(base, (B - hat(B))[i])),
# breaks = breaks, xlim = xlim, freq = FALSE)
# lines(density(d2), col = 2)
# abline(v = range(d2), lty = 2)
# hist(d3, main = expression(paste(GD, (B - hat(B))[i])),
# breaks = breaks, xlim = xlim, freq = FALSE)
# lines(density(d3), col = 3)
# abline(v = range(d3), lty = 2)
# hist(d4, main = expression(paste(GD + Nest, (B - hat(B))[i])),
# breaks = breaks, xlim = xlim, freq = FALSE)
# lines(density(d4), col = 4)
# abline(v = range(d4), lty = 2)
# })
# dev.print(png, file = "hist.png", width = 768, height = 768, res = 125)
# options(width = 300)
# print(pr <- prof.tree::prof.tree("./Rprof.out"), limit = NULL
# , pruneFun = function(x) x$percent > 0.01)
# par(mfrow = c(2, 2))
# matrixImage(alpha, main = "alpha")
# matrixImage(fit$alpha, main = "fit$alpha")
# matrixImage(beta, main = "beta")
# matrixImage(fit$beta, main = "fit$beta")
# if (diff(dim(alpha) * dim(beta)) > 0) {
# par(mfrow = c(2, 1))
# } else {
# par(mfrow = c(1, 2))
# }
# matrixImage(kronecker(alpha, beta), main = "kronecker(alpha, beta)")
# matrixImage(kronecker(fit$alpha, fit$beta), main = "kronecker(fit$alpha, fit$beta)")
# matrixImage(Delta, main = "Delta")
# matrixImage(fit$Delta, main = "fit$Delta")
# local({
# a <- (-1 * (sum(sign(fit$alpha) * sign(alpha)) < 0)) * fit$alpha / mean(fit$alpha^2)
# b <- alpha / mean(alpha^2)
# norm(a - b, "F")
# })
# local({
# a <- (-1 * (sum(sign(fit$beta) * sign(beta)) < 0)) * fit$beta / mean(fit$beta^2)
# b <- beta / mean(beta^2)
# norm(a - b, "F")
# })
# # Which Sequence?
# x <- y <- 1
# replicate(40, x <<- (y <<- x + y) - x)
# # Face-Splitting Product
# n <- 100
# p <- 3
# q <- 500
# A <- matrix(rnorm(n * p), n)
# B <- matrix(rnorm(n * q), n)
# faceSplit <- function(A, B) {
# C <- vapply(seq_len(ncol(A)), function(i) A[, i] * B, B)
# dim(C) <- c(nrow(A), ncol(A) * ncol(B))
# C
# }
# all.equal(
# tkhatriRao(A, B),
# faceSplit(A, B)
# )
# microbenchmark::microbenchmark(
# tkhatriRao(A, B),
# faceSplit(A, B)
# )
# dist.kron <- function(a0, b0, a1, b1) {
# sqrt(sum(a0^2) * sum(b0^2) -
# 2 * sum(diag(crossprod(a0, a1))) * sum(diag(crossprod(b0, b1))) +
# sum(a1^2) * sum(b1^2))
# }
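# The identity behind `dist.kron` (a sketch; "x" denoting the Kronecker
# product): using <A x B, C x D>_F = <A, C>_F <B, D>_F, we get
#   |A0 x B0 - A1 x B1|_F^2
#     = |A0|_F^2 |B0|_F^2 - 2 tr(A0' A1) tr(B0' B1) + |A1|_F^2 |B1|_F^2,
# which is exactly what `dist.kron` evaluates without forming the products.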
# alpha <- matrix(rnorm(q * r), q)
# beta <- matrix(rnorm(p * k), p)
# alpha.true <- matrix(rnorm(q * r), q)
# beta.true <- matrix(rnorm(p * k), p)
# all.equal(
# dist.kron(alpha, beta, alpha.true, beta.true),
# norm(kronecker(alpha, beta) - kronecker(alpha.true, beta.true), "F")
# )
# A <- matrix(rnorm(p^2), p)
# B <- matrix(rnorm(p^2), p)
# tr <- function(A) sum(diag(A))
# tr(crossprod(A, B))
# tr(tcrossprod(B, A))
# tr(crossprod(A, A))
# tr(tcrossprod(A, A))
# sum(A^2)
# alpha <- matrix(rnorm(q * r), q)
# beta <- matrix(rnorm(p * k), p)
# norm(kronecker(alpha, beta), "F")^2
# norm(alpha, "F")^2 * norm(beta, "F")^2
# tr(crossprod(kronecker(alpha, beta)))
# tr(tcrossprod(kronecker(alpha, beta)))
# tr(crossprod(kronecker(t(alpha), t(beta))))
# tr(crossprod(alpha)) * tr(crossprod(beta))
# tr(tcrossprod(alpha)) * tr(tcrossprod(beta))
# tr(crossprod(alpha)) * tr(tcrossprod(beta))
# sum(alpha^2) * sum(beta^2)
# alpha <- matrix(rnorm(q * r), q)
# beta <- matrix(rnorm(p * k), p)
# alpha.true <- matrix(rnorm(q * r), q)
# beta.true <- matrix(rnorm(p * k), p)
# microbenchmark::microbenchmark(
# norm(kronecker(alpha, beta), "F")^2,
# norm(alpha, "F")^2 * norm(beta, "F")^2,
# tr(crossprod(kronecker(alpha, beta))),
# tr(tcrossprod(kronecker(alpha, beta))),
# tr(crossprod(kronecker(t(alpha), t(beta)))),
# tr(crossprod(alpha)) * tr(crossprod(beta)),
# tr(tcrossprod(alpha)) * tr(tcrossprod(beta)),
# tr(crossprod(alpha)) * tr(tcrossprod(beta)),
# sum(alpha^2) * sum(beta^2),
# setup = {
# p <- sample(1:10, 1)
# q <- sample(1:10, 1)
# k <- sample(1:10, 1)
# r <- sample(1:10, 1)
# assign("alpha", matrix(rnorm(q * r), q), .GlobalEnv)
# assign("beta", matrix(rnorm(p * k), p), .GlobalEnv)
# assign("alpha.true", matrix(rnorm(q * r), q), .GlobalEnv)
# assign("beta.true", matrix(rnorm(p * k), p), .GlobalEnv)
# }
# )
# p <- sample(1:15, 1) # 11
# q <- sample(1:15, 1) # 3
# k <- sample(1:15, 1) # 7
# r <- sample(1:15, 1) # 5
# A <- matrix(rnorm(q * r), q)
# B <- matrix(rnorm(p * k), p)
# a <- matrix(rnorm(q * r), q)
# b <- matrix(rnorm(p * k), p)
# all.equal(
# kronecker(A + a, B + b),
# kronecker(A, B) + kronecker(A, b) + kronecker(a, B) + kronecker(a, b)
# )
# p <- 200L
# n <- 100L
# R <- matrix(rnorm(n * p), n)
# A <- matrix(rnorm(p^2), p) # Base Matrix
# B <- A + 0.01 * matrix(rnorm(p^2), p) # Distortion / Update of A
# A.inv <- solve(A)
# microbenchmark::microbenchmark(
# solve = R %*% solve(B),
# neumann.raw = R %*% (A.inv + A.inv %*% (A - B) %*% A.inv + A.inv %*% (A - B) %*% A.inv %*% (A - B) %*% A.inv),
# neumann.fun = {
# AD <- A.inv %*% (A - B)
# res <- A.inv + AD %*% A.inv
# res <- A.inv + AD %*% res
# R %*% res
# }
# )
# all.equal(
# A.inv + A.inv %*% (A - B) %*% A.inv + A.inv %*% (A - B) %*% A.inv %*% (A - B) %*% A.inv,
# {
# DA <- (A - B) %*% A.inv
# res <- A.inv + A.inv %*% DA
# res <- A.inv + res %*% DA
# res
# }
# )
# all.equal(
# A.inv + A.inv %*% (A - B) %*% A.inv + A.inv %*% (A - B) %*% A.inv %*% (A - B) %*% A.inv,
# {
# AD <- A.inv %*% (A - B)
# res <- A.inv + AD %*% A.inv
# res <- A.inv + AD %*% res
# res
# }
# )
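# Rationale for the Neumann variants above (a sketch): writing B = A - D with
# D = A - B, we have B^-1 = (I - A^-1 D)^-1 A^-1
#   = (I + A^-1 D + (A^-1 D)^2 + ...) A^-1,
# which converges whenever the spectral radius of A^-1 D is below 1; both
# `neumann.raw` and `neumann.fun` are the second-order truncation of this series.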
# #####
# sym <- function(A) A + t(A)
# n <- 101
# p <- 7
# q <- 11
# r <- 3
# k <- 5
# R <- array(rnorm(n * p * q), dim = c(n = n, p = p, q = q))
# F <- array(rnorm(n * k * r), dim = c(n = n, k = k, r = r))
# alpha <- array(rnorm(q * r), dim = c(q = q, r = r))
# beta <- array(rnorm(p * k), dim = c(p = p, k = k))
# Delta.1 <- sym(matrix(rnorm(q * q), q, q))
# dim(Delta.1) <- c(q = q, q = q)
# Delta.2 <- sym(matrix(rnorm(p * p), p, p))
# dim(Delta.2) <- c(p = p, p = p)
# Delta <- kronecker(Delta.1, Delta.2)
# grad.alpha.1 <- local({
# Di.1 <- solve(Delta.1)
# Di.2 <- solve(Delta.2)
# .R <- sapply(seq_len(n), function(i) tcrossprod(Di.1, R[i, , ]) %*% Di.2)
# dim(.R) <- c(q, p, n)
# .F <- sapply(seq_len(n), function(i) beta %*% F[i, , ])
# dim(.F) <- c(p, r, n)
# .C <- sapply(seq_len(n), function(i) .R[i, , ] %*% .F[i, , ])
# dim(.C) <- c(n, q, r)
# colSums(.C)
# })
# grad.alpha.2 <- local({
# # Delta.1^-1 R' Delta.2^-1
# .R <- aperm(R, c(2, 1, 3))
# dim(.R) <- c(q, n * p)
# .R <- solve(Delta.1) %*% .R
# dim(.R) <- c(q, n, p)
# .R <- aperm(.R, c(3, 2, 1))
# dim(.R) <- c(p, n * q)
# .R <- solve(Delta.2) %*% .R
# dim(.R) <- c(p, n, q)
# .R <- aperm(.R, c(2, 3, 1)) # n x q x p
# # beta F
# .F <- aperm(F, c(2, 1, 3))
# dim(.F) <- c(k, n * r)
# .F <- beta %*% .F
# dim(.F) <- c(p, n, r)
# .F <- aperm(.F, c(2, 1, 3)) # n x p x r
# # (Delta.1^-1 R' Delta.2^-1) (beta F)
# .R <- aperm(.R, c(1, 3, 2))
# dim(.R) <- c(n * p, q)
# dim(.F) <- c(n * p, r)
# crossprod(.R, .F)
# })
# all.equal(
# grad.alpha.1,
# grad.alpha.2
# )
# all.equal({
# .R <- matrix(0, q, p)
# Di.1 <- solve(Delta.1)
# Di.2 <- solve(Delta.2)
# for (i in 1:n) {
# .R <- .R + Di.1 %*% t(R[i, , ]) %*% Di.2
# }
# .R
# }, {
# .R <- R
# Di.1 <- solve(Delta.1)
# Di.2 <- solve(Delta.2)
# .R <- sapply(seq_len(n), function(i) tcrossprod(Di.1, R[i, , ]) %*% Di.2)
# dim(.R) <- c(q, p, n)
# .R <- aperm(.R, c(3, 1, 2))
# colSums(.R)
# })
# all.equal({
# Di.1 <- solve(Delta.1)
# Di.2 <- solve(Delta.2)
# .R <- sapply(seq_len(n), function(i) tcrossprod(Di.1, R[i, , ]) %*% Di.2)
# dim(.R) <- c(q, p, n)
# .R <- aperm(.R, c(3, 1, 2))
# .R
# }, {
# .R <- R
# dim(.R) <- c(n * p, q)
# .R <- .R %*% solve(Delta.1)
# dim(.R) <- c(n, p, q)
# .R <- aperm(.R, c(1, 3, 2))
# dim(.R) <- c(n * q, p)
# .R <- .R %*% solve(Delta.2)
# dim(.R) <- c(n, q, p)
# .R
# })
# all.equal({
# .F <- matrix(0, p, r)
# for (i in 1:n) {
# .F <- .F + beta %*% F[i, , ]
# }
# .F
# }, {
# .F <- apply(F, 1, function(Fi) beta %*% Fi)
# dim(.F) <- c(p, r, n)
# .F <- aperm(.F, c(3, 1, 2))
# colSums(.F)
# })
# all.equal({
# .F <- apply(F, 1, function(Fi) beta %*% Fi)
# dim(.F) <- c(p, r, n)
# .F <- aperm(.F, c(3, 1, 2))
# colSums(.F)
# }, {
# .F <- aperm(F, c(1, 3, 2))
# dim(.F) <- c(n * r, k)
# .F <- tcrossprod(.F, beta)
# dim(.F) <- c(n, r, p)
# t(colSums(.F))
# })
# all.equal({
# Di.1 <- solve(Delta.1)
# Di.2 <- solve(Delta.2)
# grad.alpha <- 0
# grad.beta <- 0
# dim(R) <- c(n, p, q)
# dim(F) <- c(n, k, r)
# for (i in 1:n) {
# grad.alpha <- grad.alpha + (
# Di.1 %*% t(R[i, , ]) %*% Di.2 %*% beta %*% F[i, , ]
# )
# grad.beta <- grad.beta + (
# Di.2 %*% R[i, , ] %*% Di.1 %*% alpha %*% t(F[i, , ])
# )
# }
# g1 <- c(dim(grad.alpha), dim(grad.beta), grad.alpha, grad.beta)
# }, {
# # Note that the order is important since for grad.beta the residuals do NOT
# # need to be transposed.
# # left/right standardized residuals Delta_1^-1 R_i' Delta_2^-1 for i in 1:n
# dim(R) <- c(n * p, q)
# .R <- R %*% solve(Delta.1)
# dim(.R) <- c(n, p, q)
# .R <- aperm(.R, c(1, 3, 2))
# dim(.R) <- c(n * q, p)
# .R <- .R %*% solve(Delta.2)
# dim(.R) <- c(n, q, p)
# # gradient with respect to beta
# # Response function times alpha (F_i alpha')
# dim(F) <- c(n * k, r)
# .F <- tcrossprod(F, alpha)
# dim(.F) <- c(n, k, q)
# .F <- aperm(.F, c(1, 3, 2))
# # Matricize
# dim(.R) <- c(n * q, p)
# dim(.F) <- c(n * q, k)
# grad.beta <- crossprod(.R, .F)
# # gradient with respect to alpha
# # Response function times beta (beta F_i)
# dim(F) <- c(n, k, r)
# .F <- aperm(F, c(1, 3, 2))
# dim(.F) <- c(n * r, k)
# .F <- tcrossprod(.F, beta)
# dim(.F) <- c(n, r, p)
# .F <- aperm(.F, c(1, 3, 2))
# # Transpose stand. residuals
# dim(.R) <- c(n, q, p)
# .R <- aperm(.R, c(1, 3, 2))
# # Matricize
# dim(.R) <- c(n * p, q)
# dim(.F) <- c(n * p, r)
# grad.alpha <- crossprod(.R, .F)
# g2 <- c(dim(grad.alpha), dim(grad.beta), grad.alpha, grad.beta)
# })
# microbenchmark::microbenchmark(R1 = {
# Di.1 <- solve(Delta.1)
# Di.2 <- solve(Delta.2)
# grad.alpha <- 0 # matrix(0, q, r)
# grad.beta <- 0 # matrix(0, p, k)
# dim(R) <- c(n, p, q)
# dim(F) <- c(n, k, r)
# for (i in 1:n) {
# grad.alpha <- grad.alpha + (
# Di.1 %*% t(R[i, , ]) %*% Di.2 %*% beta %*% F[i, , ]
# )
# grad.beta <- grad.beta + (
# Di.2 %*% R[i, , ] %*% Di.1 %*% alpha %*% t(F[i, , ])
# )
# }
# g1 <- c(dim(grad.alpha), dim(grad.beta), grad.alpha, grad.beta)
# }, R3 = {
# # Note that the order is important since for grad.beta the residuals do NOT
# # need to be transposed.
# # left/right standardized residuals Delta_1^-1 R_i' Delta_2^-1 for i in 1:n
# dim(R) <- c(n * p, q)
# .R <- R %*% solve(Delta.1)
# dim(.R) <- c(n, p, q)
# .R <- aperm(.R, c(1, 3, 2))
# dim(.R) <- c(n * q, p)
# .R <- .R %*% solve(Delta.2)
# dim(.R) <- c(n, q, p)
# # gradient with respect to beta
# # Response function times alpha (F_i alpha')
# dim(F) <- c(n * k, r)
# .F <- tcrossprod(F, alpha)
# dim(.F) <- c(n, k, q)
# .F <- aperm(.F, c(1, 3, 2))
# # Matricize
# dim(.R) <- c(n * q, p)
# dim(.F) <- c(n * q, k)
# grad.beta <- crossprod(.R, .F)
# # gradient with respect to alpha
# # Response function times beta (beta F_i)
# dim(F) <- c(n, k, r)
# .F <- aperm(F, c(1, 3, 2))
# dim(.F) <- c(n * r, k)
# .F <- tcrossprod(.F, beta)
# dim(.F) <- c(n, r, p)
# .F <- aperm(.F, c(1, 3, 2))
# # Transpose stand. residuals
# dim(.R) <- c(n, q, p)
# .R <- aperm(.R, c(1, 3, 2))
# # Matricize
# dim(.R) <- c(n * p, q)
# dim(.F) <- c(n * p, r)
# grad.alpha <- crossprod(.R, .F)
# g2 <- c(dim(grad.alpha), dim(grad.beta), grad.alpha, grad.beta)
# })
# n <- 100
# p <- 7
# q <- 11
# k <- 3
# r <- 5
# X <- array(rnorm(n * p * q), dim = c(n = n, p = p, q = q))
# F <- array(rnorm(n * k * r), dim = c(n = n, k = k, r = r))
# alpha <- array(rnorm(q * r), dim = c(q = q, r = r))
# beta <- array(rnorm(p * k), dim = c(p = p, k = k))
# all.equal({
# R <- array(NA, dim = c(n, p, q))
# for (i in 1:n) {
# R[i, , ] <- X[i, , ] - beta %*% F[i, , ] %*% t(alpha)
# }
# R
# }, {
# X - (F %x_3% alpha %x_2% beta)
# }, check.attributes = FALSE)
# microbenchmark::microbenchmark(base = {
# R <- array(NA, dim = c(n, p, q))
# for (i in 1:n) {
# R[i, , ] <- X[i, , ] - beta %*% F[i, , ] %*% t(alpha)
# }
# R
# }, ttm = {
# X - (F %x_3% alpha %x_2% beta)
# })
# n <- 100; p <- 7; q <- 11; k <- 3; r <- 5
# sym <- function(x) t(x) + x
# Di.1 <- sym(matrix(rnorm(q^2), q, q))
# Di.2 <- sym(matrix(rnorm(p^2), p, p))
# R <- array(rnorm(n * p * q), dim = c(n, p, q))
# F <- array(rnorm(n * k * r), dim = c(n, k, r))
# alpha <- matrix(rnorm(q * r), q, r)
# beta <- matrix(rnorm(p * k), p, k)
# all.equal({
# .R <- array(NA, dim = c(n, p, q))
# for (i in 1:n) {
# .R[i, , ] <- Di.2 %*% R[i, , ] %*% Di.1
# }
# .R
# }, {
# R %x_3% Di.1 %x_2% Di.2
# })
# all.equal({
# .Rt <- array(NA, dim = c(n, q, p))
# for (i in 1:n) {
# .Rt[i, , ] <- Di.1 %*% t(R[i, , ]) %*% Di.2
# }
# .Rt
# }, {
# .Rt <- R %x_3% Di.1 %x_2% Di.2
# aperm(.Rt, c(1, 3, 2))
# })
# all.equal({
# .Fa <- array(NA, dim = c(n, q, k))
# for (i in 1:n) {
# .Fa[i, , ] <- alpha %*% t(F[i, , ])
# }
# .Fa
# }, {
# aperm(F %x_3% alpha, c(1, 3, 2))
# })
# all.equal({
# .Fb <- array(NA, dim = c(n, p, r))
# for (i in 1:n) {
# .Fb[i, , ] <- beta %*% F[i, , ]
# }
# .Fb
# }, {
# F %x_2% beta
# })
# all.equal({
# .F <- array(NA, dim = c(n, p, q))
# for (i in 1:n) {
# .F[i, , ] <- beta %*% F[i, , ] %*% t(alpha)
# }
# .F
# }, {
# F %x_3% alpha %x_2% beta
# })
# all.equal({
# .Ga <- 0
# for (i in 1:n) {
# .Ga <- .Ga + Di.1 %*% t(R[i, , ]) %*% Di.2 %*% beta %*% F[i, , ]
# }
# .Ga
# }, {
# .R <- R %x_3% Di.1 %x_2% Di.2
# dim(.R) <- c(n * p, q)
# .Fb <- F %x_2% beta
# dim(.Fb) <- c(n * p, r)
# crossprod(.R, .Fb)
# })
# all.equal({
# .Gb <- 0
# for (i in 1:n) {
# .Gb <- .Gb + Di.2 %*% R[i, , ] %*% Di.1 %*% alpha %*% t(F[i, , ])
# }
# .Gb
# }, {
# .Rt <- aperm(R %x_3% Di.1 %x_2% Di.2, c(1, 3, 2))
# dim(.Rt) <- c(n * q, p)
# .Fa <- aperm(F %x_3% alpha, c(1, 3, 2))
# dim(.Fa) <- c(n * q, k)
# crossprod(.Rt, .Fa)
# })
# all.equal({
# .Ga <- 0
# for (i in 1:n) {
# .Ga <- .Ga + Di.1 %*% t(R[i, , ]) %*% Di.2 %*% beta %*% F[i, , ]
# }
# .Gb <- 0
# for (i in 1:n) {
# .Gb <- .Gb + Di.2 %*% R[i, , ] %*% Di.1 %*% alpha %*% t(F[i, , ])
# }
# c(.Ga, .Gb)
# }, {
# .R <- R %x_3% Di.1 %x_2% Di.2
# .Fb <- F %x_2% beta
# .Fa <- aperm(F %x_3% alpha, c(1, 3, 2))
# dim(.R) <- c(n * p, q)
# dim(.Fb) <- c(n * p, r)
# .Ga <- crossprod(.R, .Fb)
# dim(.R) <- c(n, p, q)
# .R <- aperm(.R, c(1, 3, 2))
# dim(.R) <- c(n * q, p)
# dim(.Fa) <- c(n * q, k)
# .Gb <- crossprod(.R, .Fa)
# c(.Ga, .Gb)
# })
# all.equal({
# .Ga <- 0
# for (i in 1:n) {
# .Ga <- .Ga + Di.1 %*% t(R[i, , ]) %*% Di.2 %*% beta %*% F[i, , ]
# }
# .Gb <- 0
# for (i in 1:n) {
# .Gb <- .Gb + Di.2 %*% R[i, , ] %*% Di.1 %*% alpha %*% t(F[i, , ])
# }
# c(.Ga, .Gb)
# }, {
# .R <- R %x_3% Di.1 %x_2% Di.2
# .Ga <- tcrossprod(mat(.R, 3), mat(F %x_2% beta, 3))
# .Gb <- tcrossprod(mat(.R, 2), mat(F %x_3% alpha, 2))
# c(.Ga, .Gb)
# })
# n <- 101; p <- 5; q <- 7
# sym <- function(x) crossprod(x)
# D1 <- sym(matrix(rnorm(q^2), q, q))
# D2 <- sym(matrix(rnorm(p^2), p, p))
# X <- tensorPredictors:::rmvnorm(n, sigma = kronecker(D1, D2))
# dim(X) <- c(n, p, q)
# D1.hat <- tcrossprod(mat(X, 3)) / n
# D2.hat <- tcrossprod(mat(X, 2)) / n
# local({
# par(mfrow = c(2, 2))
# matrixImage(D1, main = "D1")
# matrixImage(D1.hat, main = "D1.hat")
# matrixImage(D2, main = "D2")
# matrixImage(D2.hat, main = "D2.hat")
# })
# sum(X^2) / n
# sum(diag(D1.hat))
# sum(diag(D2.hat))
# sum(diag(kronecker(D1, D2)))
# sum(diag(kronecker(D1.hat / sqrt(sum(diag(D1.hat))),
# D2.hat / sqrt(sum(diag(D1.hat))))))
# all.equal({
# mat(X, 1) %*% kronecker(D1.hat, D2.hat)
# }, {
# mat(X %x_3% D1.hat %x_2% D2.hat, 1)
# })
# all.equal({
# C <- mat(X, 1) %*% kronecker(D1.hat, D2.hat) * (n / sum(X^2))
# dim(C) <- c(n, p, q)
# C
# }, {
# (X %x_3% D1.hat %x_2% D2.hat) / sum(diag(D1.hat))
# })
# D.1 <- tcrossprod(mat(X, 3))
# D.2 <- tcrossprod(mat(X, 2))
# tr <- sum(diag(D.1))
# D.1 <- D.1 / sqrt(n * tr)
# D.2 <- D.2 / sqrt(n * tr)
# sum(diag(kronecker(D1, D2)))
# sum(diag(kronecker(D.1, D.2)))
# det(kronecker(D1, D2))
# det(kronecker(D.1, D.2))
# det(D.1)^p * det(D.2)^q
# log(det(kronecker(D.1, D.2)))
# p * log(det(D.1)) + q * log(det(D.2))
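# Reference identities checked above (D.1 of size q x q, D.2 of size p x p):
#   tr(D.1 %x% D.2)  == tr(D.1) * tr(D.2)
#   det(D.1 %x% D.2) == det(D.1)^p * det(D.2)^q
# and hence log(det(D.1 %x% D.2)) == p * log(det(D.1)) + q * log(det(D.2)).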


@@ -0,0 +1,136 @@
#' Approximates the log-likelihood using unbiased (but not MLE) estimates of
#' the Kronecker decomposed covariance matrices Delta_1, Delta_2, which
#' yields a closed-form solution for the gradient.
#'
#' Delta_1 = n^-1 sum_i R_i' R_i,
#' Delta_2 = n^-1 sum_i R_i R_i'.
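#'
#' In tensor notation, with the residuals stacked into a 3-tensor `R`
#' (observations on the first mode), these are proportional to
#' `tcrossprod(mat(R, 3))` and `tcrossprod(mat(R, 2))`; the code below keeps
#' them unnormalized and tracks the overall scale through the factor `s`.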
#'
#' @export
kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
    max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
    nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
    eps = .Machine$double.eps,
    logger = NULL
) {
    # Check if X and Fy have same number of observations
    stopifnot(nrow(X) == NROW(Fy))
    n <- nrow(X)    # Number of observations

    # Get and check predictor dimensions
    if (length(dim(X)) == 2L) {
        stopifnot(!missing(shape))
        stopifnot(ncol(X) == prod(shape[1:2]))
        p <- as.integer(shape[1])   # Predictor "height"
        q <- as.integer(shape[2])   # Predictor "width"
    } else if (length(dim(X)) == 3L) {
        p <- dim(X)[2]
        q <- dim(X)[3]
        dim(X) <- c(n, p * q)
    } else {
        stop("'X' must be a matrix or 3-tensor")
    }

    # Get and check response dimensions
    if (!is.array(Fy)) {
        Fy <- as.array(Fy)
    }
    if (length(dim(Fy)) == 1L) {
        k <- r <- 1L
        dim(Fy) <- c(n, 1L)
    } else if (length(dim(Fy)) == 2L) {
        stopifnot(!missing(shape))
        stopifnot(ncol(Fy) == prod(shape[3:4]))
        k <- as.integer(shape[3])   # Response functional "height"
        r <- as.integer(shape[4])   # Response functional "width"
    } else if (length(dim(Fy)) == 3L) {
        k <- dim(Fy)[2]
        r <- dim(Fy)[3]
        dim(Fy) <- c(n, k * r)
    } else {
        stop("'Fy' must be a vector, matrix or 3-tensor")
    }
    ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`

    # Vectorize
    dim(Fy) <- c(n, k * r)
    dim(X) <- c(n, p * q)

    # Solve
    cpFy <- crossprod(Fy)   # TODO: Check/Test and/or replace
    if (n <= k * r || qr(cpFy)$rank < k * r) {
        # In case of an under-determined system replace the inverse in the
        # normal equation by the Moore-Penrose pseudo inverse
        B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
    } else {
        # Compute OLS estimate by the normal equation
        B <- t(solve(cpFy, crossprod(Fy, X)))
    }

    # De-vectorize (from now on tensor arithmetic)
    dim(Fy) <- c(n, k, r)
    dim(X) <- c(n, p, q)

    # Decompose `B = alpha x beta` into `alpha` and `beta`
    c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))

    # Compute residuals
    R <- X - (Fy %x_3% alpha0 %x_2% beta0)

    # Covariance estimates and scaling factor; note that
    # tr(Delta.1) == tr(Delta.2) == sum(R^2), so either mode gives the same `s`
    Delta.1 <- tcrossprod(mat(R, 3))
    Delta.2 <- tcrossprod(mat(R, 2))
    s <- sum(diag(Delta.1))

    # Inverse covariances
    Delta.1.inv <- solve(Delta.1)
    Delta.2.inv <- solve(Delta.2)

    # Cross-dependent covariance estimates
    S.1 <- n^-1 * tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3))
    S.2 <- n^-1 * tcrossprod(mat(R, 2), mat(R %x_3% Delta.1.inv, 2))

    # Evaluate negative log-likelihood (2 pi term dropped)
    loss <- -0.5 * n * (p * q * log(s) - p * log(det(Delta.1)) -
        q * log(det(Delta.2)) - s * sum(S.1 * Delta.1.inv))

    # Gradient "generating" tensor
    G <- (sum(S.1 * Delta.1.inv) - p * q / s) * R
    G <- G + R %x_2% ((diag(q, p, p) - s * (Delta.2.inv %*% S.2)) %*% Delta.2.inv)
    G <- G + R %x_3% ((diag(p, q, q) - s * (Delta.1.inv %*% S.1)) %*% Delta.1.inv)
    G <- G + s * (R %x_2% Delta.2.inv %x_3% Delta.1.inv)
    # Call history callback (logger) before the first iteration
    if (is.function(logger)) {
        logger(0L, loss, alpha0, beta0, Delta.1, Delta.2, NA)
    }
    ### Step 2: MLE estimate with LS solution as starting value
    a0 <- 0
    a1 <- 1
    alpha1 <- alpha0
    beta1 <- beta0

    # main descent loop
    no.nesterov <- TRUE
    break.reason <- NA
    for (iter in seq_len(max.iter)) {
        if (no.nesterov) {
            # without extrapolation as fallback
            alpha.moment <- alpha1
            beta.moment <- beta1
        } else {
            # extrapolation using previous direction
            alpha.moment <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0)
            beta.moment  <- beta1  + ((a0 - 1) / a1) * ( beta1 - beta0)
        }

        # Extrapolated residuals
        R <- X - (Fy %x_3% alpha.moment %x_2% beta.moment)

        # TODO (WIP): recompute the covariance terms and the gradient at the
        # extrapolated point, then perform the line-search step and the
        # Nesterov updates of `a0`, `a1`, `alpha0`/`alpha1`, `beta0`/`beta1`.
    }

    # Return the WIP estimates (Delta.1, Delta.2 in place of a full `Delta`)
    list(loss = loss, alpha = alpha1, beta = beta1,
         Delta.1 = Delta.1, Delta.2 = Delta.2)
}
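A minimal smoke-test sketch for the WIP routine above (hypothetical dimensions; since the descent loop is not finished, this only exercises the least-squares initialization, the initial loss and the moment extrapolation):

library(tensorPredictors)

n <- 50; p <- 4; q <- 5; k <- 2; r <- 3
alpha <- matrix(rnorm(q * r), q, r)
beta <- matrix(rnorm(p * k), p, k)
Fy <- array(rnorm(n * k * r), dim = c(n, k, r))
X <- (Fy %x_3% alpha %x_2% beta) + array(rnorm(n * p * q), dim = c(n, p, q))

fit <- kpir.approx(X, Fy, max.iter = 10L)
str(fit)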


@@ -1,12 +1,50 @@
 #' (Slightly altered) old implementation
 #'
 #' @export
-kpir.base <- function(X, Fy, p, t, k = 1L, r = 1L, d1 = 1L, d2 = 1L,
+kpir.base <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
     method = c("mle", "ls"),
     eps1 = 1e-10, eps2 = 1e-10, max.iter = 500L,
     logger = NULL
 ) {
+    # Check if X and Fy have same number of observations
+    stopifnot(nrow(X) == NROW(Fy))
+    n <- nrow(X)    # Number of observations
+
+    # Get and check predictor dimensions
+    if (length(dim(X)) == 2L) {
+        stopifnot(!missing(shape))
+        stopifnot(ncol(X) == prod(shape[1:2]))
+        p <- as.integer(shape[1])   # Predictor "height"
+        q <- as.integer(shape[2])   # Predictor "width"
+    } else if (length(dim(X)) == 3L) {
+        p <- dim(X)[2]
+        q <- dim(X)[3]
+        dim(X) <- c(n, p * q)
+    } else {
+        stop("'X' must be a matrix or 3-tensor")
+    }
+
+    # Get and check response dimensions
+    if (!is.array(Fy)) {
+        Fy <- as.array(Fy)
+    }
+    if (length(dim(Fy)) == 1L) {
+        k <- r <- 1L
+        dim(Fy) <- c(n, 1L)
+    } else if (length(dim(Fy)) == 2L) {
+        stopifnot(!missing(shape))
+        stopifnot(ncol(Fy) == prod(shape[3:4]))
+        k <- as.integer(shape[3])   # Response functional "height"
+        r <- as.integer(shape[4])   # Response functional "width"
+    } else if (length(dim(Fy)) == 3L) {
+        k <- dim(Fy)[2]
+        r <- dim(Fy)[3]
+        dim(Fy) <- c(n, k * r)
+    } else {
+        stop("'Fy' must be a vector, matrix or 3-tensor")
+    }
     log.likelihood <- function(par, X, Fy, Delta.inv, da, db) {
         alpha <- matrix(par[1:prod(da)], da[1L])
         beta <- matrix(par[(prod(da) + 1):length(par)], db[1L])
@@ -17,10 +55,6 @@ kpir.base <- function(X, Fy, p, t, k = 1L, r = 1L, d1 = 1L, d2 = 1L,
     # Validate method using inexact matching.
     method <- match.arg(method)
 
-    # ## Step 1:
-    # # OLS estimate of the model `X = F_y B + epsilon`.
-    # B <- t(solve(crossprod(Fy), crossprod(Fy, X)))
-
     ### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
     cpFy <- crossprod(Fy)
     if (n <= k * r || qr(cpFy)$rank < k * r) {
@@ -33,7 +67,7 @@ kpir.base <- function(X, Fy, p, t, k = 1L, r = 1L, d1 = 1L, d2 = 1L,
     }
 
     # Estimate alpha, beta as nearest Kronecker approximation.
-    c(alpha, beta) %<-% approx.kronecker(B, c(t, r), c(p, k))
+    c(alpha, beta) %<-% approx.kronecker(B, c(q, r), c(p, k))
 
     if (method == "ls") {
         # Estimate Delta.
@@ -63,13 +97,13 @@ kpir.base <- function(X, Fy, p, t, k = 1L, r = 1L, d1 = 1L, d2 = 1L,
     for (iter in 1:max.iter) {
         # Optimize log-likelihood for alpha, beta with fixed Delta.
         opt <- optim(c(alpha, beta), log.likelihood, gr = NULL,
-            X, Fy, matpow(Delta, -1), c(t, r), c(p, k))
+            X, Fy, matpow(Delta, -1), c(q, r), c(p, k))
         # Store previous alpha, beta and Delta (for break condition).
         Delta.last <- Delta
         B.last <- B
         # Extract optimized alpha, beta.
-        alpha <- matrix(opt$par[1:(t * r)], t, r)
-        beta <- matrix(opt$par[(t * r + 1):length(opt$par)], p, k)
+        alpha <- matrix(opt$par[1:(q * r)], q, r)
+        beta <- matrix(opt$par[(q * r + 1):length(opt$par)], p, k)
         # Calc new Delta with likelihood-optimized alpha, beta.
         B <- kronecker(alpha, beta)
         resid <- X - tcrossprod(Fy, B)
@@ -85,9 +119,9 @@ kpir.base <- function(X, Fy, p, t, k = 1L, r = 1L, d1 = 1L, d2 = 1L,
         }
 
         # Check break condition 1.
-        if (norm(Delta - Delta.last, 'F') < eps1 * norm(Delta, 'F')) {
+        if (norm(Delta - Delta.last, "F") < eps1 * norm(Delta, "F")) {
             # Check break condition 2.
-            if (norm(B - B.last, 'F') < eps2 * norm(B, 'F')) {
+            if (norm(B - B.last, "F") < eps2 * norm(B, "F")) {
                 break
             }
         }


@@ -65,7 +65,7 @@ kpir.kron <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
     # De-vectorize (from now on tensor arithmetic)
     dim(Fy) <- c(n, k, r)
     dim(X) <- c(n, p, q)
 
     # Decompose `B = alpha x beta` into `alpha` and `beta`
     c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))


@@ -1,10 +1,10 @@
-#' Gradient Descent Bases Tensor Predictors method with Nesterov Accelerated
+#' Gradient Descent based Tensor Predictors method with Nesterov Accelerated
 #' Momentum
 #'
 #' @export
 kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
     max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
-    nesterov.scaling = function(a, t) { 0.5 * (1 + sqrt(1 + (2 * a)^2)) },
+    nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
     eps = .Machine$double.eps,
     logger = NULL
 ) {