wip: kpir_sim,
removed: kpir kronecker (wrong) version, wip: kpir_ls
This commit is contained in:
parent
7c33cc152f
commit
49bf4bdf20
|
@ -95,3 +95,12 @@ wip/
|
||||||
|
|
||||||
# PDFs
|
# PDFs
|
||||||
*.pdf
|
*.pdf
|
||||||
|
|
||||||
|
# LaTeX (ignore everything except *.tex and *.bib files)
|
||||||
|
**/LaTeX/*
|
||||||
|
!**/LaTeX/*.tex
|
||||||
|
!**/LaTeX/*.bib
|
||||||
|
**/LaTeX/*-blx.bib
|
||||||
|
|
||||||
|
mlda_analysis/
|
||||||
|
References/
|
||||||
|
|
|
@ -87,7 +87,7 @@ We start with a brief summary of the used notation.
|
||||||
|
|
||||||
\todo{write this}
|
\todo{write this}
|
||||||
|
|
||||||
Let $\ten{A}$ be a order (rank) $r$ tensor of dimensions $p_1\times ... \times p_r$ and the matrices $\mat{B}_i$ of dimensions $q_i\times p_i$ for $i = 1, ..., r$, then
|
Let $\ten{A}$ be a multi-dimensional array of order (rank) $r$ with dimensions $p_1\times ... \times p_r$ and the matrices $\mat{B}_i$ of dimensions $q_i\times p_i$ for $i = 1, ..., r$, then
|
||||||
\begin{displaymath}
|
\begin{displaymath}
|
||||||
\ten{A} \ttm[1] \mat{B}_1 \ttm[2] \ldots \ttm[r] \mat{B}_r
|
\ten{A} \ttm[1] \mat{B}_1 \ttm[2] \ldots \ttm[r] \mat{B}_r
|
||||||
= \ten{A}\times\{ \mat{B}_1, ..., \mat{B}_r \}
|
= \ten{A}\times\{ \mat{B}_1, ..., \mat{B}_r \}
|
||||||
|
@ -110,6 +110,59 @@ Another example
|
||||||
|
|
||||||
\todo{continue}
|
\todo{continue}
|
||||||
|
|
||||||
|
\section{Tensor Normal Distribution}
|
||||||
|
Let $\ten{X}$ be a multi-dimensional array random variable of order (rank) $r$ with dimensions $p_1\times ... \times p_r$ written as
|
||||||
|
\begin{displaymath}
|
||||||
|
\ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r).
|
||||||
|
\end{displaymath}
|
||||||
|
Its density is given by
|
||||||
|
\begin{displaymath}
|
||||||
|
f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1}
|
||||||
|
\exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right)
|
||||||
|
\end{displaymath}
|
||||||
|
where $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution
|
||||||
|
\begin{displaymath}
|
||||||
|
\vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1)
|
||||||
|
\end{displaymath}
|
||||||
|
with $p = \prod_{i = 1}^r p_i$.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
\begin{theorem}[Tensor Normal to Multi-Variate Normal equivalence]
|
||||||
|
For a multi-dimensional random variable $\ten{X}$ of order $r$ with dimensions $p_1\times ..., p_r$. Let $\ten{\mu}$ be the mean of the same order and dimensions as $\ten{X}$ and the mode covariance matrices $\mat{\Delta}_i$ of dimensions $p_i\times p_i$ for $i = 1, ..., n$. Then the tensor normal distribution is equivalent to the multi-variate normal distribution by the relation
|
||||||
|
\begin{displaymath}
|
||||||
|
\ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r)
|
||||||
|
\qquad\Leftrightarrow\qquad
|
||||||
|
\vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes ...\otimes \mat{\Delta}_1)
|
||||||
|
\end{displaymath}
|
||||||
|
where $p = \prod_{i = 1}^r p_i$.
|
||||||
|
\end{theorem}
|
||||||
|
\begin{proof}
|
||||||
|
A straight forward way is to rewrite the Tensor Normal density as the density of a Multi-Variate Normal distribution depending on the vectorization of $\ten{X}$. First consider
|
||||||
|
\begin{align*}
|
||||||
|
\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle
|
||||||
|
&= \t{\vec(\ten{X} - \mu)}\vec((\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\}) \\
|
||||||
|
&= \t{\vec(\ten{X} - \mu)}(\mat{\Delta}_r^{-1}\otimes ...\otimes\mat{\Delta}_1^{-1})\vec(\ten{X} - \mu) \\
|
||||||
|
&= \t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu).
|
||||||
|
\end{align*}
|
||||||
|
Next, using a property of the determinant of a Kronecker product $|\mat{\Delta}_1\otimes\mat{\Delta}_2| = |\mat{\Delta}_1|^{p_2}|\mat{\Delta}_2|^{p_1}$ yields
|
||||||
|
\begin{displaymath}
|
||||||
|
|\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
|
||||||
|
= |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_2|^{p_1}|\mat{\Delta}_1|^{q_1}
|
||||||
|
\end{displaymath}
|
||||||
|
where $q_i = \prod_{j \neq i}p_j$. By induction over $r$ the relation
|
||||||
|
\begin{displaymath}
|
||||||
|
|\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1|
|
||||||
|
= \prod_{i = 1}^r |\mat{\Delta}_i|^{q_i}
|
||||||
|
\end{displaymath}
|
||||||
|
holds for arbitrary order $r$. Substituting into the Tensor Normal density leads to
|
||||||
|
\begin{align*}
|
||||||
|
f(\ten{X}) = \Big( (2\pi)^p |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| \Big)^{-1/2}
|
||||||
|
\exp\!\left( -\frac{1}{2}\t{(\vec\ten{X} - \vec\mu)}(\mat{\Delta}_r\otimes ...\otimes\mat{\Delta}_1)^{-1}(\vec\ten{X} - \vec\mu) \right)
|
||||||
|
\end{align*}
|
||||||
|
which is the Multi-Variate Normal density of the $p$ dimensional vector $\vec\ten{X}$.
|
||||||
|
\end{proof}
|
||||||
|
|
||||||
|
|
||||||
\section{Introduction}
|
\section{Introduction}
|
||||||
We assume the model
|
We assume the model
|
||||||
|
|
|
@ -13,7 +13,7 @@ log.prog <- function(max.iter) {
|
||||||
|
|
||||||
|
|
||||||
### Exec all methods for a given data set and collect logs ###
|
### Exec all methods for a given data set and collect logs ###
|
||||||
sim <- function(X, Fy, shape, alpha.true, beta.true, max.iter = 500L) {
|
sim <- function(X, Fy, alpha.true, beta.true, max.iter = 500L) {
|
||||||
|
|
||||||
# Logger creator
|
# Logger creator
|
||||||
logger <- function(name) {
|
logger <- function(name) {
|
||||||
|
@ -26,7 +26,8 @@ sim <- function(X, Fy, shape, alpha.true, beta.true, max.iter = 500L) {
|
||||||
dist.alpha = (dist.alpha <- dist.subspace(c(alpha.true), c(alpha))),
|
dist.alpha = (dist.alpha <- dist.subspace(c(alpha.true), c(alpha))),
|
||||||
dist.beta = (dist.beta <- dist.subspace(c( beta.true), c(beta ))),
|
dist.beta = (dist.beta <- dist.subspace(c( beta.true), c(beta ))),
|
||||||
norm.alpha = norm(alpha, "F"),
|
norm.alpha = norm(alpha, "F"),
|
||||||
norm.beta = norm(beta, "F")
|
norm.beta = norm(beta, "F"),
|
||||||
|
mse = mean((X - mlm(Fy, alpha, beta, modes = 3:2))^2)
|
||||||
)
|
)
|
||||||
|
|
||||||
cat(sprintf(
|
cat(sprintf(
|
||||||
|
@ -39,36 +40,36 @@ sim <- function(X, Fy, shape, alpha.true, beta.true, max.iter = 500L) {
|
||||||
}
|
}
|
||||||
|
|
||||||
# Initialize logger history targets
|
# Initialize logger history targets
|
||||||
hist.base <- hist.new <- hist.momentum <- hist.approx <- # hist.kron <-
|
hist.base <- hist.new <- hist.momentum <- hist.approx <- hist.ls <-
|
||||||
data.frame(iter = seq(0L, max.iter),
|
data.frame(iter = seq(0L, max.iter),
|
||||||
loss = NA, dist = NA, dist.alpha = NA, dist.beta = NA,
|
loss = NA, dist = NA, dist.alpha = NA, dist.beta = NA,
|
||||||
norm.alpha = NA, norm.beta = NA
|
norm.alpha = NA, norm.beta = NA, mse = NA
|
||||||
)
|
)
|
||||||
|
|
||||||
# Base (old)
|
# Base (old)
|
||||||
kpir.base(X, Fy, shape, max.iter = max.iter, logger = logger("base"))
|
kpir.base(X, Fy, max.iter = max.iter, logger = logger("base"))
|
||||||
|
|
||||||
# New (simple Gradient Descent)
|
# New (simple Gradient Descent)
|
||||||
kpir.new(X, Fy, shape, max.iter = max.iter, logger = logger("new"))
|
kpir.new(X, Fy, max.iter = max.iter, logger = logger("new"))
|
||||||
|
|
||||||
|
# Least Squares estimate (alternating estimation)
|
||||||
|
kpir.ls(X, Fy, sample.mode = 1L, max.iter = max.iter, logger = logger("ls"))
|
||||||
|
|
||||||
# Gradient Descent with Nesterov Momentum
|
# Gradient Descent with Nesterov Momentum
|
||||||
kpir.momentum(X, Fy, shape, max.iter = max.iter, logger = logger("momentum"))
|
kpir.momentum(X, Fy, max.iter = max.iter, logger = logger("momentum"))
|
||||||
|
|
||||||
# # Residual Covariance Kronecker product assumpton version
|
|
||||||
# kpir.kron(X, Fy, shape, max.iter = max.iter, logger = logger("kron"))
|
|
||||||
|
|
||||||
# Approximated MLE with Nesterov Momentum
|
# Approximated MLE with Nesterov Momentum
|
||||||
kpir.approx(X, Fy, shape, max.iter = max.iter, logger = logger("approx"))
|
kpir.approx(X, Fy, max.iter = max.iter, logger = logger("approx"))
|
||||||
|
|
||||||
# Add method tags
|
# Add method tags
|
||||||
hist.base$method <- factor("base")
|
hist.base$method <- factor("base")
|
||||||
hist.new$method <- factor("new")
|
hist.new$method <- factor("new")
|
||||||
|
hist.ls$method <- factor("ls")
|
||||||
hist.momentum$method <- factor("momentum")
|
hist.momentum$method <- factor("momentum")
|
||||||
# hist.kron$method <- factor("kron")
|
|
||||||
hist.approx$method <- factor("approx")
|
hist.approx$method <- factor("approx")
|
||||||
|
|
||||||
# Combine results and return
|
# Combine results and return
|
||||||
rbind(hist.base, hist.new, hist.momentum, hist.approx) #, hist.kron
|
rbind(hist.base, hist.new, hist.momentum, hist.approx, hist.ls)
|
||||||
}
|
}
|
||||||
|
|
||||||
## Plot helper function
|
## Plot helper function
|
||||||
|
@ -107,10 +108,10 @@ plot.hist2 <- function(hist, response, type = "all", ...) {
|
||||||
|
|
||||||
## Generate some test data / DEBUG
|
## Generate some test data / DEBUG
|
||||||
n <- 200 # Sample Size
|
n <- 200 # Sample Size
|
||||||
p <- sample(1:15, 1) # 11
|
p <- sample(2:15, 1) # 11
|
||||||
q <- sample(1:15, 1) # 3
|
q <- sample(2:15, 1) # 7
|
||||||
k <- sample(1:15, 1) # 7
|
k <- min(sample(1:15, 1), p - 1) # 3
|
||||||
r <- sample(1:15, 1) # 5
|
r <- min(sample(1:15, 1), q - 1) # 5
|
||||||
print(c(n, p, q, k, r))
|
print(c(n, p, q, k, r))
|
||||||
|
|
||||||
hist <- NULL
|
hist <- NULL
|
||||||
|
@ -129,8 +130,10 @@ for (rep in 1:reps) {
|
||||||
))
|
))
|
||||||
Delta <- 0.5^abs(outer(seq_len(p * q), seq_len(p * q), `-`))
|
Delta <- 0.5^abs(outer(seq_len(p * q), seq_len(p * q), `-`))
|
||||||
X <- tcrossprod(Fy, kronecker(alpha, beta)) + CVarE:::rmvnorm(n, sigma = Delta)
|
X <- tcrossprod(Fy, kronecker(alpha, beta)) + CVarE:::rmvnorm(n, sigma = Delta)
|
||||||
|
dim(X) <- c(n, p, q)
|
||||||
|
dim(Fy) <- c(n, k, r)
|
||||||
|
|
||||||
hist.sim <- sim(X, Fy, shape = c(p, q, k, r), alpha.true, beta.true)
|
hist.sim <- sim(X, Fy, alpha.true, beta.true)
|
||||||
hist.sim$repetition <- rep
|
hist.sim$repetition <- rep
|
||||||
|
|
||||||
hist <- rbind(hist, hist.sim)
|
hist <- rbind(hist, hist.sim)
|
||||||
|
@ -167,22 +170,22 @@ for (response in c("loss", "dist", "dist.alpha", "dist.beta")) {
|
||||||
|
|
||||||
n <- 200 # Sample Size
|
n <- 200 # Sample Size
|
||||||
p <- 11 # sample(1:15, 1)
|
p <- 11 # sample(1:15, 1)
|
||||||
q <- 3 # sample(1:15, 1)
|
q <- 7 # sample(1:15, 1)
|
||||||
k <- 7 # sample(1:15, 1)
|
k <- 3 # sample(1:15, 1)
|
||||||
r <- 5 # sample(1:15, 1)
|
r <- 5 # sample(1:15, 1)
|
||||||
print(c(n, p, q, k, r))
|
print(c(n, p, q, k, r))
|
||||||
|
|
||||||
hist <- NULL
|
hist <- NULL
|
||||||
reps <- 20
|
reps <- 20
|
||||||
|
max.iter <- 2
|
||||||
|
|
||||||
Delta.1 <- sqrt(0.5)^abs(outer(seq_len(q), seq_len(q), `-`))
|
Delta.1 <- sqrt(0.5)^abs(outer(seq_len(p), seq_len(p), `-`))
|
||||||
Delta.2 <- sqrt(0.5)^abs(outer(seq_len(p), seq_len(p), `-`))
|
Delta.2 <- sqrt(0.5)^abs(outer(seq_len(q), seq_len(q), `-`))
|
||||||
Delta <- kronecker(Delta.1, Delta.2)
|
|
||||||
for (rep in 1:reps) {
|
for (rep in 1:reps) {
|
||||||
cat(sprintf("%4d / %d simulation rep. started\n", rep, reps))
|
cat(sprintf("%4d / %d simulation rep. started\n", rep, reps))
|
||||||
|
|
||||||
alpha.true <- alpha <- matrix(rnorm(q * r), q, r)
|
alpha.1.true <- alpha.1 <- matrix(rnorm(q * r), q, r)
|
||||||
beta.true <- beta <- matrix(rnorm(p * k), p, k)
|
alpha.2.true <- alpha.2 <- matrix(rnorm(p * k), p, k)
|
||||||
y <- rnorm(n)
|
y <- rnorm(n)
|
||||||
Fy <- do.call(cbind, Map(function(slope, offset) {
|
Fy <- do.call(cbind, Map(function(slope, offset) {
|
||||||
sin(slope * y + offset)
|
sin(slope * y + offset)
|
||||||
|
@ -190,15 +193,16 @@ for (rep in 1:reps) {
|
||||||
head(rep(seq(1, ceiling(0.5 * k * r)), each = 2), k * r),
|
head(rep(seq(1, ceiling(0.5 * k * r)), each = 2), k * r),
|
||||||
head(rep(c(0, pi / 2), ceiling(0.5 * k * r)), k * r)
|
head(rep(c(0, pi / 2), ceiling(0.5 * k * r)), k * r)
|
||||||
))
|
))
|
||||||
X <- tcrossprod(Fy, kronecker(alpha, beta)) + CVarE:::rmvnorm(n, sigma = Delta)
|
dim(Fy) <- c(n, k, r)
|
||||||
|
X <- mlm(Fy, alpha, beta, modes = 3:2)
|
||||||
|
X <- X + rtensornorm(n, 0, Delta.1, Delta.2, sample.mode = 1L)
|
||||||
|
|
||||||
hist.sim <- sim(X, Fy, shape = c(p, q, k, r), alpha.true, beta.true)
|
hist.sim <- sim(X, Fy, alpha.true, beta.true, max.iter = max.iter)
|
||||||
hist.sim$repetition <- rep
|
hist.sim$repetition <- rep
|
||||||
|
|
||||||
hist <- rbind(hist, hist.sim)
|
hist <- rbind(hist, hist.sim)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# Save simulation results
|
# Save simulation results
|
||||||
sim.name <- "sim02"
|
sim.name <- "sim02"
|
||||||
datetime <- format(Sys.time(), "%Y%m%dT%H%M")
|
datetime <- format(Sys.time(), "%Y%m%dT%H%M")
|
||||||
|
@ -207,14 +211,77 @@ saveRDS(hist, file = sprintf("%s_%s.rds", sim.name, datetime))
|
||||||
# for GGPlot2, as factors for grouping
|
# for GGPlot2, as factors for grouping
|
||||||
hist$repetition <- factor(hist$repetition)
|
hist$repetition <- factor(hist$repetition)
|
||||||
|
|
||||||
for (response in c("loss", "dist", "dist.alpha", "dist.beta")) {
|
for (response in c("loss", "mse", "dist", "dist.alpha", "dist.beta")) {
|
||||||
for (fun in c("all", "mean", "median")) {
|
for (fun in c("all", "mean", "median")) {
|
||||||
print(plot.hist2(hist, response, fun, title = fun) + coord_trans(x = "log1p"))
|
print(plot.hist2(hist, response, fun, title = fun) + coord_trans(x = "log1p"))
|
||||||
dev.print(png, file = sprintf("%s_%s_%s_%s.png", sim.name, datetime, response, fun),
|
dev.print(png, file = sprintf("%s_%s_%s_%s.png", sim.name, datetime, response, fun),
|
||||||
width = 768, height = 768, res = 125)
|
width = 768, height = 768, res = 125)
|
||||||
|
if (response != "loss") {
|
||||||
|
print(plot.hist2(hist, response, fun, title = fun) + coord_trans(x = "log1p", y = "log1p"))
|
||||||
|
dev.print(png, file = sprintf("%s_%s_%s_%s_log.png", sim.name, datetime, response, fun),
|
||||||
|
width = 768, height = 768, res = 125)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
### Sim 3 ###
|
||||||
|
################################################################################
|
||||||
|
n <- 200
|
||||||
|
p <- c(7, 11, 5) # response dimensions (order 3)
|
||||||
|
q <- c(3, 6, 2) # predictor dimensions (order 3)
|
||||||
|
|
||||||
|
# currently only kpir.ls suppoert higher orders (order > 2)
|
||||||
|
sim3 <- function(X, Fy, alphas.true, max.iter = 500L) {
|
||||||
|
|
||||||
|
# Logger creator
|
||||||
|
logger <- function(name) {
|
||||||
|
eval(substitute(function(iter, loss, alpha, beta, ...) {
|
||||||
|
hist[iter + 1L, ] <<- c(
|
||||||
|
iter = iter,
|
||||||
|
loss = loss,
|
||||||
|
mse = (mse <- mean((X - mlm(Fy, alpha, beta, modes = 3:2))^2)),
|
||||||
|
(dist <- unlist(Map(dist.subspace, alphas, alphas.true)))
|
||||||
|
)
|
||||||
|
|
||||||
|
cat(sprintf(
|
||||||
|
"%s(%3d) | loss: %-12.4f - mse: %-12.4f - sum(dist): %-.4e\n",
|
||||||
|
name, iter, loss, sum(dist)
|
||||||
|
))
|
||||||
|
}, list(hist = as.symbol(paste0("hist.", name)))))
|
||||||
|
}
|
||||||
|
|
||||||
|
# Initialize logger history targets
|
||||||
|
hist.ls <-
|
||||||
|
do.call(data.frame, c(list(
|
||||||
|
iter = seq(0, r), loss = NA, mse = NA),
|
||||||
|
dist = rep(NA, length(dim(X)) - 1L)
|
||||||
|
))
|
||||||
|
|
||||||
|
# Approximated MLE with Nesterov Momentum
|
||||||
|
kpir.ls(X, Fy, sample.mode = 1L, max.iter = max.iter, logger = logger("ls"))
|
||||||
|
|
||||||
|
# Add method tags
|
||||||
|
hist.ls$method <- factor("ls")
|
||||||
|
|
||||||
|
# # Combine results and return
|
||||||
|
# rbind(hist.base, hist.new, hist.momentum, hist.approx, hist.ls)
|
||||||
|
hist.ls
|
||||||
|
}
|
||||||
|
|
||||||
|
sample.data3 <- function(n, p, q) {
|
||||||
|
stopifnot(length(p) == length(q))
|
||||||
|
stopifnot(all(q <= p))
|
||||||
|
|
||||||
|
Deltas <- Map(function(nrow) {
|
||||||
|
|
||||||
|
}, p)
|
||||||
|
|
||||||
|
list(X, Fy, alphas, Deltas)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
### WIP ###
|
### WIP ###
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
|
@ -1,3 +1,26 @@
|
||||||
|
################################################################################
|
||||||
|
### Sparce SIR against SIR ###
|
||||||
|
################################################################################
|
||||||
|
devtools::load_all('tensorPredictors/')
|
||||||
|
library(dr)
|
||||||
|
|
||||||
|
n <- 100
|
||||||
|
p <- 10
|
||||||
|
|
||||||
|
X <- rmvnorm(n, sigma = 0.5^abs(outer(1:p, 1:p, `-`)))
|
||||||
|
y <- rowSums(X[, 1:3]) + rnorm(n, 0.5)
|
||||||
|
B <- as.matrix(1:p <= 3) / sqrt(3)
|
||||||
|
|
||||||
|
dr.sir <- dr(y ~ X, method = 'sir', numdir = ncol(B))
|
||||||
|
B.sir <- dr.sir$evectors[, seq_len(ncol(B)), drop = FALSE]
|
||||||
|
|
||||||
|
dist.projection(B, B.sir)
|
||||||
|
|
||||||
|
B.poi <- with(GEP(X, y, 'sir'), {
|
||||||
|
POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = TRUE)$vectors
|
||||||
|
})
|
||||||
|
|
||||||
|
dist.projection(B, B.poi)
|
||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
### LDA (sparse Linear Discrimina Analysis) ###
|
### LDA (sparse Linear Discrimina Analysis) ###
|
||||||
|
@ -69,19 +92,19 @@ dataset <- function(nr) {
|
||||||
list(X = X, y = y, B = qr.Q(qr(B)))
|
list(X = X, y = y, B = qr.Q(qr(B)))
|
||||||
}
|
}
|
||||||
|
|
||||||
# # Model 1
|
# Model 1
|
||||||
# fit <- with(dataset(1), {
|
fit <- with(dataset(1), {
|
||||||
# with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C'))
|
with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C'))
|
||||||
# })
|
})
|
||||||
# fit <- with(dataset(1), {
|
fit <- with(dataset(1), {
|
||||||
# with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C'))
|
with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C'))
|
||||||
# })
|
})
|
||||||
# fit <- with(dataset(1), {
|
fit <- with(dataset(1), {
|
||||||
# with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = TRUE))
|
with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = TRUE))
|
||||||
# })
|
})
|
||||||
# fit <- with(dataset(1), {
|
fit <- with(dataset(1), {
|
||||||
# with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C', use.C = TRUE))
|
with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C', use.C = TRUE))
|
||||||
# })
|
})
|
||||||
|
|
||||||
# head(fit$vectors, 10)
|
# head(fit$vectors, 10)
|
||||||
|
|
||||||
|
@ -90,19 +113,24 @@ nr.reps <- 100
|
||||||
sim <- replicate(nr.reps, {
|
sim <- replicate(nr.reps, {
|
||||||
res <- double(0)
|
res <- double(0)
|
||||||
for (model.nr in 1:5) {
|
for (model.nr in 1:5) {
|
||||||
for (method in c('POI-C', 'FastPOI-C')) {
|
with(dataset(model.nr), {
|
||||||
for (use.C in c(FALSE, TRUE)) {
|
for (method in c('POI-C', 'FastPOI-C')) {
|
||||||
dist <- with(dataset(model.nr), {
|
for (use.C in c(FALSE, TRUE)) {
|
||||||
fit <- with(GEP(X, y, 'lda'), {
|
fit <- with(GEP(X, y, 'lda'), {
|
||||||
POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = use.C)
|
POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = use.C)
|
||||||
})
|
})
|
||||||
# dist.subspace(B, fit$vectors, is.ortho = FALSE, normalize = TRUE)
|
dist <- dist.projection(B, fit$vectors)
|
||||||
dist.projection(B, fit$vectors)
|
names(dist) <- paste('M', model.nr, '-', method, '-', use.C)
|
||||||
})
|
res <<- c(res, dist)
|
||||||
names(dist) <- paste('M', model.nr, '-', method, '-', use.C)
|
}
|
||||||
res <- c(res, dist)
|
|
||||||
}
|
}
|
||||||
}
|
fit <- with(GEP(X, y, 'lda'), {
|
||||||
|
solve.gep(lhs, rhs, ncol(B))
|
||||||
|
})
|
||||||
|
dist <- dist.projection(B, fit$vectors)
|
||||||
|
names(dist) <- paste('M', model.nr, '- solve -', use.C)
|
||||||
|
res <<- c(res, dist)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
cat("Counter", (count <<- count + 1), "/", nr.reps, "\n")
|
cat("Counter", (count <<- count + 1), "/", nr.reps, "\n")
|
||||||
res
|
res
|
||||||
|
|
|
@ -16,12 +16,14 @@ export(dist.subspace)
|
||||||
export(kpir.approx)
|
export(kpir.approx)
|
||||||
export(kpir.base)
|
export(kpir.base)
|
||||||
export(kpir.kron)
|
export(kpir.kron)
|
||||||
|
export(kpir.ls)
|
||||||
export(kpir.momentum)
|
export(kpir.momentum)
|
||||||
export(kpir.new)
|
export(kpir.new)
|
||||||
export(mat)
|
export(mat)
|
||||||
export(matpow)
|
export(matpow)
|
||||||
export(matrixImage)
|
export(matrixImage)
|
||||||
export(mcrossprod)
|
export(mcrossprod)
|
||||||
|
export(mlm)
|
||||||
export(reduce)
|
export(reduce)
|
||||||
export(rowKronecker)
|
export(rowKronecker)
|
||||||
export(rtensornorm)
|
export(rtensornorm)
|
||||||
|
|
|
@ -1,216 +0,0 @@
|
||||||
#' Gradient Descent Bases Tensor Predictors method with Nesterov Accelerated
|
|
||||||
#' Momentum and Kronecker structure assumption for the residual covariance
|
|
||||||
#' `Delta = Delta.1 %x% Delta.2` (simple plugin version!)
|
|
||||||
#'
|
|
||||||
#' @export
|
|
||||||
kpir.kron <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])),
|
|
||||||
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
|
|
||||||
nesterov.scaling = function(a, t) { 0.5 * (1 + sqrt(1 + (2 * a)^2)) },
|
|
||||||
eps = .Machine$double.eps,
|
|
||||||
logger = NULL
|
|
||||||
) {
|
|
||||||
|
|
||||||
# Check if X and Fy have same number of observations
|
|
||||||
stopifnot(nrow(X) == NROW(Fy))
|
|
||||||
n <- nrow(X) # Number of observations
|
|
||||||
|
|
||||||
# Get and check predictor dimensions (convert to 3-tensor if needed)
|
|
||||||
if (length(dim(X)) == 2L) {
|
|
||||||
stopifnot(!missing(shape))
|
|
||||||
stopifnot(ncol(X) == prod(shape[1:2]))
|
|
||||||
p <- as.integer(shape[1]) # Predictor "height"
|
|
||||||
q <- as.integer(shape[2]) # Predictor "width"
|
|
||||||
} else if (length(dim(X)) == 3L) {
|
|
||||||
p <- dim(X)[2]
|
|
||||||
q <- dim(X)[3]
|
|
||||||
} else {
|
|
||||||
stop("'X' must be a matrix or 3-tensor")
|
|
||||||
}
|
|
||||||
|
|
||||||
# Get and check response dimensions (and convert to 3-tensor if needed)
|
|
||||||
if (!is.array(Fy)) {
|
|
||||||
Fy <- as.array(Fy)
|
|
||||||
}
|
|
||||||
if (length(dim(Fy)) == 1L) {
|
|
||||||
k <- r <- 1L
|
|
||||||
dim(Fy) <- c(n, 1L, 1L)
|
|
||||||
} else if (length(dim(Fy)) == 2L) {
|
|
||||||
stopifnot(!missing(shape))
|
|
||||||
stopifnot(ncol(Fy) == prod(shape[3:4]))
|
|
||||||
k <- as.integer(shape[3]) # Response functional "height"
|
|
||||||
r <- as.integer(shape[4]) # Response functional "width"
|
|
||||||
} else if (length(dim(Fy)) == 3L) {
|
|
||||||
k <- dim(Fy)[2]
|
|
||||||
r <- dim(Fy)[3]
|
|
||||||
} else {
|
|
||||||
stop("'Fy' must be a vector, matrix or 3-tensor")
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
|
|
||||||
# Vectorize
|
|
||||||
dim(Fy) <- c(n, k * r)
|
|
||||||
dim(X) <- c(n, p * q)
|
|
||||||
# Solve
|
|
||||||
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
|
|
||||||
if (n <= k * r || qr(cpFy)$rank < k * r) {
|
|
||||||
# In case of under-determined system replace the inverse in the normal
|
|
||||||
# equation by the Moore-Penrose Pseudo Inverse
|
|
||||||
B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
|
|
||||||
} else {
|
|
||||||
# Compute OLS estimate by the Normal Equation
|
|
||||||
B <- t(solve(cpFy, crossprod(Fy, X)))
|
|
||||||
}
|
|
||||||
|
|
||||||
# De-Vectroize (from now on tensor arithmetics)
|
|
||||||
dim(Fy) <- c(n, k, r)
|
|
||||||
dim(X) <- c(n, p, q)
|
|
||||||
|
|
||||||
# Decompose `B = alpha x beta` into `alpha` and `beta`
|
|
||||||
c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
|
|
||||||
|
|
||||||
# Compute residuals
|
|
||||||
resid <- X - (Fy %x_3% alpha0 %x_2% beta0)
|
|
||||||
|
|
||||||
# Covariance estimate
|
|
||||||
Delta.1 <- tcrossprod(mat(resid, 3))
|
|
||||||
Delta.2 <- tcrossprod(mat(resid, 2))
|
|
||||||
tr <- sum(diag(Delta.1))
|
|
||||||
Delta.1 <- Delta.1 / sqrt(n * tr)
|
|
||||||
Delta.2 <- Delta.2 / sqrt(n * tr)
|
|
||||||
|
|
||||||
# Transformed Residuals
|
|
||||||
resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)
|
|
||||||
|
|
||||||
# Evaluate negative log-likelihood
|
|
||||||
loss <- 0.5 * (n * (p * log(det(Delta.1)) + q * log(det(Delta.2))) +
|
|
||||||
sum(resid.trans * resid))
|
|
||||||
|
|
||||||
# Call history callback (logger) before the first iterate
|
|
||||||
if (is.function(logger)) {
|
|
||||||
logger(0L, loss, alpha0, beta0, Delta.1, Delta.2, NA)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
### Step 2: MLE with LS solution as starting value
|
|
||||||
a0 <- 0
|
|
||||||
a1 <- 1
|
|
||||||
alpha1 <- alpha0
|
|
||||||
beta1 <- beta0
|
|
||||||
|
|
||||||
# main descent loop
|
|
||||||
no.nesterov <- TRUE
|
|
||||||
break.reason <- NA
|
|
||||||
for (iter in seq_len(max.iter)) {
|
|
||||||
if (no.nesterov) {
|
|
||||||
# without extrapolation as fallback
|
|
||||||
S.alpha <- alpha1
|
|
||||||
S.beta <- beta1
|
|
||||||
} else {
|
|
||||||
# extrapolation using previous direction
|
|
||||||
S.alpha <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0)
|
|
||||||
S.beta <- beta1 + ((a0 - 1) / a1) * ( beta1 - beta0)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Extrapolated Residuals
|
|
||||||
resid <- X - (Fy %x_3% S.alpha %x_2% S.beta)
|
|
||||||
|
|
||||||
# Covariance Estimates
|
|
||||||
Delta.1 <- tcrossprod(mat(resid, 3))
|
|
||||||
Delta.2 <- tcrossprod(mat(resid, 2))
|
|
||||||
tr <- sum(diag(Delta.1))
|
|
||||||
Delta.1 <- Delta.1 / sqrt(n * tr)
|
|
||||||
Delta.2 <- Delta.2 / sqrt(n * tr)
|
|
||||||
|
|
||||||
# Transform Residuals
|
|
||||||
resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)
|
|
||||||
|
|
||||||
# Calculate Gradients
|
|
||||||
grad.alpha <- tcrossprod(mat(resid.trans, 3), mat(Fy %x_2% S.beta, 3))
|
|
||||||
grad.beta <- tcrossprod(mat(resid.trans, 2), mat(Fy %x_3% S.alpha, 2))
|
|
||||||
|
|
||||||
# Backtracking line search (Armijo type)
|
|
||||||
# The `inner.prod` is used in the Armijo break condition but does not
|
|
||||||
# depend on the step size.
|
|
||||||
inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
|
|
||||||
|
|
||||||
# backtracking loop
|
|
||||||
for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) {
|
|
||||||
# Update `alpha` and `beta` (note: add(+), the gradients are already
|
|
||||||
# pointing into the negative slope direction of the loss cause they are
|
|
||||||
# the gradients of the log-likelihood [NOT the negative log-likelihood])
|
|
||||||
alpha.temp <- S.alpha + delta * grad.alpha
|
|
||||||
beta.temp <- S.beta + delta * grad.beta
|
|
||||||
|
|
||||||
# Update Residuals, Covariance and transformed Residuals
|
|
||||||
resid <- X - (Fy %x_3% alpha.temp %x_2% beta.temp)
|
|
||||||
Delta.1 <- tcrossprod(mat(resid, 3))
|
|
||||||
Delta.2 <- tcrossprod(mat(resid, 2))
|
|
||||||
tr <- sum(diag(Delta.1))
|
|
||||||
Delta.1 <- Delta.1 / sqrt(n * tr)
|
|
||||||
Delta.2 <- Delta.2 / sqrt(n * tr)
|
|
||||||
resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)
|
|
||||||
|
|
||||||
# Evaluate negative log-likelihood
|
|
||||||
loss.temp <- 0.5 * (n * (p * log(det(Delta.1)) + q * log(det(Delta.2)))
|
|
||||||
+ sum(resid.trans * resid))
|
|
||||||
|
|
||||||
# Armijo line search break condition
|
|
||||||
if (loss.temp <= loss - 0.1 * delta * inner.prod) {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
# Call logger (invoke history callback)
|
|
||||||
if (is.function(logger)) {
|
|
||||||
logger(iter, loss.temp, alpha.temp, beta.temp, Delta.1, Delta.2, delta)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Ensure descent
|
|
||||||
if (loss.temp < loss) {
|
|
||||||
alpha0 <- alpha1
|
|
||||||
alpha1 <- alpha.temp
|
|
||||||
beta0 <- beta1
|
|
||||||
beta1 <- beta.temp
|
|
||||||
|
|
||||||
# check break conditions (in descent case)
|
|
||||||
if (mean(abs(alpha1)) + mean(abs(beta1)) < eps) {
|
|
||||||
break.reason <- "alpha, beta numerically zero"
|
|
||||||
break # basically, estimates are zero -> stop
|
|
||||||
}
|
|
||||||
if (inner.prod < eps * (p * q + r * k)) {
|
|
||||||
break.reason <- "mean squared gradient is smaller than epsilon"
|
|
||||||
break # mean squared gradient is smaller than epsilon -> stop
|
|
||||||
}
|
|
||||||
if (abs(loss.temp - loss) < eps) {
|
|
||||||
break.reason <- "decrease is too small (slow)"
|
|
||||||
break # decrease is too small (slow) -> stop
|
|
||||||
}
|
|
||||||
|
|
||||||
loss <- loss.temp
|
|
||||||
no.nesterov <- FALSE # always reset
|
|
||||||
} else if (!no.nesterov) {
|
|
||||||
no.nesterov <- TRUE # retry without momentum
|
|
||||||
next
|
|
||||||
} else {
|
|
||||||
break.reason <- "failed even without momentum"
|
|
||||||
break # failed even without momentum -> stop
|
|
||||||
}
|
|
||||||
|
|
||||||
# update momentum scaling
|
|
||||||
a0 <- a1
|
|
||||||
a1 <- nesterov.scaling(a1, iter)
|
|
||||||
|
|
||||||
# Set next iter starting step.size to line searched step size
|
|
||||||
# (while allowing it to encrease)
|
|
||||||
step.size <- 1.618034 * delta
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
list(
|
|
||||||
loss = loss,
|
|
||||||
alpha = alpha1, beta = beta1,
|
|
||||||
Delta.1 = Delta.1, Delta.2 = Delta.2,
|
|
||||||
break.reason = break.reason
|
|
||||||
)
|
|
||||||
}
|
|
|
@ -17,19 +17,28 @@ kpir.ls <- function(X, Fy, max.iter = 20L, sample.mode = 1L,
|
||||||
} else {
|
} else {
|
||||||
stopifnot(dim(X)[sample.mode] == dim(Fy)[sample.mode])
|
stopifnot(dim(X)[sample.mode] == dim(Fy)[sample.mode])
|
||||||
}
|
}
|
||||||
# and check shape
|
# Check dimensions
|
||||||
stopifnot(length(X) == length(Fy))
|
stopifnot(length(dim(X)) == length(dim(Fy)))
|
||||||
|
stopifnot(dim(X)[sample.mode] == dim(Fy)[sample.mode])
|
||||||
|
# and model constraints
|
||||||
|
stopifnot(all(dim(Fy) <= dim(X)))
|
||||||
|
|
||||||
# mode index sequence (exclude sample mode, a.k.a. observation axis)
|
# mode index sequence (exclude sample mode, a.k.a. observation axis)
|
||||||
modes <- seq_along(dim(X))[-sample.mode]
|
modes <- seq_along(dim(X))[-sample.mode]
|
||||||
|
|
||||||
|
|
||||||
### Step 1: initial per mode estimates
|
### Step 1: initial per mode estimates
|
||||||
alphas <- Map(function(mode, ncol) {
|
alphas <- Map(function(mode, ncol) {
|
||||||
La.svd(mcrossprod(X, mode), ncol)$u
|
La.svd(mcrossprod(X, mode = mode), ncol)$u
|
||||||
}, modes, dim(Fy)[modes])
|
}, modes, dim(Fy)[modes])
|
||||||
|
|
||||||
|
# # Scaling of alpha, such that `tr(alpha_i' alpha_i) = tr(alpha_j' alpha_j)``
|
||||||
|
# # for `i, j = 1, ..., r`.
|
||||||
|
# traces <- unlist(Map(function(alpha) sum(alpha^2)))
|
||||||
|
# alphas <- Map(`*`, prod(traces)^(1 / length(alphas)) / traces, alphas)
|
||||||
|
|
||||||
# Call history callback (logger) before the first iteration
|
# Call history callback (logger) before the first iteration
|
||||||
if (is.function(logger)) { logger(0L, alphas) }
|
if (is.function(logger)) { do.call(logger, c(0L, NA, rev(alphas))) }
|
||||||
|
|
||||||
|
|
||||||
### Step 2: iterate per mode (axis) least squares estimates
|
### Step 2: iterate per mode (axis) least squares estimates
|
||||||
|
@ -38,16 +47,31 @@ kpir.ls <- function(X, Fy, max.iter = 20L, sample.mode = 1L,
|
||||||
for (j in seq_along(modes)) {
|
for (j in seq_along(modes)) {
|
||||||
# least squares solution for `alpha_j | alpha_i, i != j`
|
# least squares solution for `alpha_j | alpha_i, i != j`
|
||||||
Z <- mlm(Fy, alphas[-j], modes = modes[-j])
|
Z <- mlm(Fy, alphas[-j], modes = modes[-j])
|
||||||
alphas[[j]] <- t(solve(mcrossprod(Z, j), tcrossprod(mat(Z, j), mat(X, j))))
|
alphas[[j]] <- t(solve(mcrossprod(Z, mode = modes[j]),
|
||||||
|
tcrossprod(mat(Z, modes[j]), mat(X, modes[j]))))
|
||||||
# TODO: alphas[[j]] <- t(solve(mcrossprod(Z, j), mcrossprod(Z, X, j)))
|
# TODO: alphas[[j]] <- t(solve(mcrossprod(Z, j), mcrossprod(Z, X, j)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# # Scaling of alpha, such that `tr(alpha_i' alpha_i) = tr(alpha_j' alpha_j)``
|
||||||
|
# # for `i, j = 1, ..., r`.
|
||||||
|
# traces <- unlist(Map(function(alpha) sum(alpha^2)))
|
||||||
|
# alphas <- Map(`*`, prod(traces)^(1 / length(alphas)) / traces, alphas)
|
||||||
|
|
||||||
# Call logger (invoke history callback)
|
# Call logger (invoke history callback)
|
||||||
if (is.function(logger)) { logger(iter, alphas) }
|
if (is.function(logger)) { do.call(logger, c(iter, NA, rev(alphas))) }
|
||||||
|
|
||||||
# TODO: add some kind of break condition
|
# TODO: add some kind of break condition
|
||||||
}
|
}
|
||||||
|
|
||||||
|
### Step 3: Moment estimates for `Delta_i`
|
||||||
|
# Residuals
|
||||||
|
R <- X - mlm(Fy, alphas, modes = modes)
|
||||||
|
# Moment estimates for `Delta_i`s
|
||||||
|
Deltas <- Map(mcrossprod, list(R), mode = modes)
|
||||||
|
Deltas <- Map(`*`, 1 / dim(X)[sample.mode], Deltas)
|
||||||
|
|
||||||
|
list(
|
||||||
|
alphas = structure(alphas, names = as.character(modes)),
|
||||||
|
Deltas = structure(Deltas, names = as.character(modes))
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue