From b8064c90efc5f445c7ecaa8a078cbe9405d4dc93 Mon Sep 17 00:00:00 2001 From: daniel Date: Fri, 6 May 2022 22:27:24 +0200 Subject: [PATCH] add: rtensornorm (sample tensor normal), add: mcrossprod (mode crossprod), wip: simulations, add: notes on multi-array (tensor) normal sampling, wip: initial estimates for alpha, beta, fix: scaling in kpir_approx --- LaTeX/main.tex | 97 ++++++++++- simulations/kpir_sim.R | 264 +++++++++++++++--------------- tensorPredictors/R/kpir_approx.R | 28 ++-- tensorPredictors/R/mcrossprod.R | 21 +++ tensorPredictors/R/rtensornorm.R | 49 ++++++ tensorPredictors/src/mcrossprod.c | 88 ++++++++++ 6 files changed, 401 insertions(+), 146 deletions(-) create mode 100644 tensorPredictors/R/mcrossprod.R create mode 100644 tensorPredictors/R/rtensornorm.R create mode 100644 tensorPredictors/src/mcrossprod.c diff --git a/LaTeX/main.tex b/LaTeX/main.tex index 705b433..98b396f 100644 --- a/LaTeX/main.tex +++ b/LaTeX/main.tex @@ -66,6 +66,15 @@ \newcommand{\pinv}[1]{{{#1}^{\dagger}}} % `Moore-Penrose pseudoinverse` \newcommand{\todo}[1]{{\color{red}TODO: #1}} +% \DeclareFontFamily{U}{mathx}{\hyphenchar\font45} +% \DeclareFontShape{U}{mathx}{m}{n}{ +% <5> <6> <7> <8> <9> <10> +% <10.95> <12> <14.4> <17.28> <20.74> <24.88> +% mathx10 +% }{} +% \DeclareSymbolFont{mathx}{U}{mathx}{m}{n} +% \DeclareMathSymbol{\bigtimes}{1}{mathx}{"91} + \begin{document} \maketitle @@ -73,6 +82,35 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Introduction %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Notation} +We start with a brief summary of the used notation. + +\todo{write this} + +Let $\ten{A}$ be a order (rank) $r$ tensor of dimensions $p_1\times ... \times p_r$ and the matrices $\mat{B}_i$ of dimensions $q_i\times p_i$ for $i = 1, ..., r$, then +\begin{displaymath} + \ten{A} \ttm[1] \mat{B}_1 \ttm[2] \ldots \ttm[r] \mat{B}_r + = \ten{A}\times\{ \mat{B}_1, ..., \mat{B}_r \} + = \ten{A}\times_{i\in[r]} \mat{B}_i + = (\ten{A}\times_{i\in[r]\backslash j} \mat{B}_i)\ttm[j]\mat{B}_j +\end{displaymath} +As an alternative example consider +\begin{displaymath} + \ten{A}\times_2\mat{B}_2\times_3\mat{B}_3 = \ten{A}\times\{ \mat{I}, \mat{B}_2, \mat{B}_3 \} = \ten{A}\times_{i\in\{2, 3\}}\mat{B}_i +\end{displaymath} +Another example +\begin{displaymath} + \mat{B}\mat{A}\t{\mat{C}} = \mat{A}\times_1\mat{B}\times_2\mat{C} + = \mat{A}\times\{\mat{B}, \mat{C}\} +\end{displaymath} + +\begin{displaymath} + (\ten{A}\ttm[i]\mat{B})_{(i)} = \mat{B}\ten{A}_{(i)} +\end{displaymath} + +\todo{continue} + + \section{Introduction} We assume the model \begin{displaymath} @@ -415,15 +453,45 @@ This leads to the following form of the differential of $\tilde{l}$ given by and therefore the gradients \begin{align*} \nabla_{\mat{\alpha}}\tilde{l}(\mat{\alpha}, \mat{\beta}) &= \sum_{i = 1}^n \t{\mat{G}_i}\mat{\beta}\mat{f}_{y_i} - = \ten{G}_{(3)}\t{(\ten{F}\ttm[2]\beta)_{(3)}}, \\ + = \ten{G}_{(3)}\t{(\ten{F}\ttm[2]\mat{\beta})_{(3)}}, \\ \nabla_{\mat{\beta}} \tilde{l}(\mat{\alpha}, \mat{\beta}) &= \sum_{i = 1}^n \mat{G}_i\mat{\alpha}\t{\mat{f}_{y_i}} - = \ten{G}_{(2)}\t{(\ten{F}\ttm[3]\beta)_{(2)}}. + = \ten{G}_{(2)}\t{(\ten{F}\ttm[3]\mat{\alpha})_{(2)}}. \end{align*} \todo{check the tensor version of the gradient!!!} \newpage +\section{Thoughts on initial value estimation} +\todo{This section uses an alternative notation as it already tries to generalize to general multi-dimensional arrays. Furthermore, one of the main differences is that the observation are indexed in the \emph{last} mode. The benefit of this is that the mode product and parameter matrix indices match not only in the population model but also in sample versions.} +Let $\ten{X}, \ten{F}$ be order (rank) $r$ tensors of dimensions $p_1\times ... \times p_r$ and $q_1\times ... \times q_r$, respectively. Also denote the error tensor $\epsilon$ of the same order and dimensions as $\ten{X}$. The considered model for the $i$'th observation is +\begin{displaymath} + \ten{X}_i = \ten{\mu} + \ten{F}_i\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon}_i +\end{displaymath} +where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distributed $\ten{\epsilon}\sim\mathcal{TM}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for $\mat{\Delta}_j\in\mathcal{S}^{p_j}_{++}$, $j = 1, ..., r$. Given $i = 1, ..., n$ observations the collected model containing all observations +\begin{displaymath} + \ten{X} = \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon} +\end{displaymath} +which is almost identical as the observations $\ten{X}_i, \ten{F}_i$ are stacked on an addition $r + 1$ mode leading to response, predictor and error tensors $\ten{X}, \ten{F}$ of order (rank) $r + 1$ and dimensions $p_1\times...\times p_r\times n$ for $\ten{X}, \ten{\epsilon}$ and $q_1\times...\times q_r\times n$ for $\ten{F}$. + +In the following we assume w.l.o.g that $\ten{\mu} = 0$, as if this is not true we simply replace $\ten{X}_i$ with $\ten{X}_i - \ten{\mu}$ for $i = 1, ..., n$ before collecting all the observations in the response tensor $\ten{X}$. + +The goal here is to find reasonable estimates for $\mat{\alpha}_i$, $i = 1, ..., n$ for the mean model +\begin{displaymath} + \E \ten{X}|\ten{F}, \mat{\alpha}_1, ..., \mat{\alpha}_r = \ten{F}\times\{\mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n\} + = \ten{F}\times_{i\in[r]}\mat{\alpha}_i. +\end{displaymath} +Under the mean model we have using the general mode product relation $(\ten{A}\times_j\mat{B})_{(j)} = \mat{B}\ten{A}_{(j)}$ we get +\begin{align*} + \ten{X}_{(j)}\t{\ten{X}_{(j)}} \overset{\text{SVD}}{=} \mat{U}_j\mat{D}_j\t{\mat{U}_j} + = \mat{\alpha}_j(\ten{F}\times_{i\in[r]\backslash j}\mat{\alpha}_i)_{(j)} + \t{(\ten{F}\times_{i\in[r]\backslash j}\mat{\alpha}_i)_{(j)}}\t{\mat{\alpha}_j} +\end{align*} +for the $j = 1, ..., r$ modes. Using this relation we construct an iterative estimation process by setting the initial estimates of $\hat{\mat{\alpha}}_j^{(0)} = \mat{U}_j[, 1:q_j]$ which are the first $q_j$ columns of $\mat{U}_j$. + +\todo{continue} + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Numerical Examples %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -596,6 +664,31 @@ if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otime f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right). \end{displaymath} +\section{Sampling form a Multi-Array Normal Distribution} +Let $\ten{X}$ be an order (rank) $r$ Multi-Array random variable of dimensions $p_1\times...\times p_r$ following a Multi-Array (or Tensor) Normal distributed +\begin{displaymath} + \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r). +\end{displaymath} +Its density is given by +\begin{displaymath} + f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1} + \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right) +\end{displaymath} +with $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution +\begin{displaymath} + \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1) +\end{displaymath} +with $p = \prod_{i = 1}^r p_i$. + +When sampling from the Multi-Array Normal one way is to sample from the Multi-Variate Normal and then reshaping the result, but this is usually very inefficient because it requires to store the multi-variate covariance matrix which is very big. Instead, it is more efficient to sample $\ten{Z}$ as a tensor of the same shape as $\ten{X}$ with standard normal entries and then transform the $\ten{Z}$ to follow the Multi-Array Normal as follows +\begin{displaymath} + \ten{Z}\sim\mathcal{TN}(0, \mat{I}_{p_1}, ..., \mat{I}_{p_r}) + \quad\Rightarrow\quad + \ten{X} = \ten{Z}\times\{\mat{\Delta}_1^{1/2}, ..., \mat{\Delta}_r^{1/2}\} + \mu\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r). +\end{displaymath} +where the sampling from the standard Multi-Array Normal is done by sampling all of the elements of $\ten{Z}$ from a standard Normal. + +\todo{Check this!!!} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Reference Summaries %%% diff --git a/simulations/kpir_sim.R b/simulations/kpir_sim.R index c10b83e..e15de28 100644 --- a/simulations/kpir_sim.R +++ b/simulations/kpir_sim.R @@ -2,6 +2,16 @@ library(tensorPredictors) library(dplyr) library(ggplot2) +## Logger callbacks +log.prog <- function(max.iter) { + function(iter, loss, alpha, beta, ...) { + select <- as.integer(seq(1, max.iter, len = 30) <= iter) + cat("\r[", paste(c(" ", "=")[1 + select], collapse = ""), + "] ", iter, "/", max.iter, sep = "") + } +} + + ### Exec all methods for a given data set and collect logs ### sim <- function(X, Fy, shape, alpha.true, beta.true, max.iter = 500L) { @@ -61,44 +71,40 @@ sim <- function(X, Fy, shape, alpha.true, beta.true, max.iter = 500L) { rbind(hist.base, hist.new, hist.momentum, hist.approx) #, hist.kron } -## Plot helper functions -plot.hist <- function(hist, response, ...) { - ggplot(hist, aes(x = iter, color = method, group = interaction(method, repetition))) + - geom_line(aes_(y = as.name(response)), na.rm = TRUE) + - geom_point(data = with(sub <- subset(hist, !is.na(as.symbol(response))), - aggregate(sub, list(method, repetition), tail, 1) - ), aes_(y = as.name(response))) + - labs(...) + - theme(legend.position = "bottom") -} -plot.stats <- function(hist, response, ..., title = "Stats") { - ggplot(hist, aes_(x = quote(iter), y = as.name(response), - color = quote(method), group = quote(method))) + - geom_ribbon(aes(color = NULL, fill = method), alpha = 0.2, - stat = "summary", fun.min = "min", fun.max = "max", na.rm = TRUE) + - geom_ribbon(aes(color = NULL, fill = method), alpha = 0.4, - stat = "summary", na.rm = TRUE, - fun.min = function(y) quantile(y, 0.25), - fun.max = function(y) quantile(y, 0.75)) + - geom_line(stat = "summary", fun = "mean", na.rm = TRUE) + - labs(title = title, ...) + - theme(legend.position = "bottom") -} -plot.mean <- function(hist, response, ..., title = "Mean") { - ggplot(hist, aes_(x = quote(iter), y = as.name(response), - color = quote(method), group = quote(method))) + - geom_line(stat = "summary", fun = "mean", na.rm = TRUE) + - labs(title = title, ...) + - theme(legend.position = "bottom") -} -plot.median <- function(hist, response, ..., title = "Median") { - ggplot(hist, aes_(x = quote(iter), y = as.name(response), - color = quote(method), group = quote(method))) + - geom_line(stat = "summary", fun = "median", na.rm = TRUE) + - labs(title = title, ...) + - theme(legend.position = "bottom") +## Plot helper function +plot.hist2 <- function(hist, response, type = "all", ...) { + # Extract final results from history + sub <- na.omit(hist[c("iter", response, "method", "repetition")]) + sub <- aggregate(sub, list(sub$method, sub$repetition), tail, 1) + + # Setup ggplot + p <- ggplot(hist, aes_(x = quote(iter), + y = as.name(response), + color = quote(method), + group = quote(interaction(method, repetition)))) + # Add requested layers + if (type == "all") { + p <- p + geom_line(na.rm = TRUE) + p <- p + geom_point(data = sub) + } else if (type == "mean") { + p <- p + geom_line(alpha = 0.5, na.rm = TRUE, linetype = "dotted") + p <- p + geom_point(data = sub, alpha = 0.5) + p <- p + geom_line(aes(group = method), + stat = "summary", fun = "mean", na.rm = TRUE) + } else if (type == "median") { + p <- p + geom_line(alpha = 0.5, na.rm = TRUE, linetype = "dotted") + p <- p + geom_point(data = sub, alpha = 0.5) + p <- p + geom_line(aes(group = method), + stat = "summary", fun = "median", na.rm = TRUE) + } + # return with theme and annotations + p + labs(...) + theme(legend.position = "bottom") } +################################################################################ +### Sim 1 / vec(X) has AR(0.5) Covariance ### +################################################################################ + ## Generate some test data / DEBUG n <- 200 # Sample Size p <- sample(1:15, 1) # 11 @@ -137,39 +143,27 @@ saveRDS(hist, file = sprintf("AR_%s.rds", datetime)) # for GGPlot2, as factors for grouping hist$repetition <- factor(hist$repetition) -plot.hist(hist, "loss") -dev.print(png, file = sprintf("sim01_loss_%s.png", datetime), width = 768, height = 768, res = 125) -plot.stats(hist, "loss") -dev.print(png, file = sprintf("sim01_loss_stats_%s.png", datetime), width = 768, height = 768, res = 125) -plot.hist(hist, "dist") -dev.print(png, file = sprintf("sim01_dist_%s.png", datetime), width = 768, height = 768, res = 125) -plot.stats(hist, "dist") -dev.print(png, file = sprintf("sim01_dist_stats_%s.png", datetime), width = 768, height = 768, res = 125) - -plot.hist(hist, "dist.alpha") -dev.print(png, file = sprintf("sim01_dist_alpha_%s.png", datetime), width = 768, height = 768, res = 125) -plot.stats(hist, "dist.alpha") -dev.print(png, file = sprintf("sim01_dist_alpha_stats_%s.png", datetime), width = 768, height = 768, res = 125) - -plot.hist(hist, "dist.beta") -dev.print(png, file = sprintf("sim01_dist_beta_%s.png", datetime), width = 768, height = 768, res = 125) -plot.stats(hist, "dist.beta") -dev.print(png, file = sprintf("sim01_dist_beta_stats_%s.png", datetime), width = 768, height = 768, res = 125) - -plot.hist(hist, "norm.alpha") -dev.print(png, file = sprintf("sim01_norm_alpha_%s.png", datetime), width = 768, height = 768, res = 125) -plot.stats(hist, "norm.alpha") -dev.print(png, file = sprintf("sim01_norm_alpha_stats_%s.png", datetime), width = 768, height = 768, res = 125) - -plot.hist(hist, "norm.beta") -dev.print(png, file = sprintf("sim01_norm_beta_%s.png", datetime), width = 768, height = 768, res = 125) -plot.stats(hist, "norm.beta") -dev.print(png, file = sprintf("sim01_norm_beta_stats_%s.png", datetime), width = 768, height = 768, res = 125) +# Save simulation results +sim.name <- "sim01" +datetime <- format(Sys.time(), "%Y%m%dT%H%M") +saveRDS(hist, file = sprintf("%s_%s.rds", sim.name, datetime)) +# for GGPlot2, as factors for grouping +hist$repetition <- factor(hist$repetition) +for (response in c("loss", "dist", "dist.alpha", "dist.beta")) { + for (fun in c("all", "mean", "median")) { + print(plot.hist2(hist, response, fun, title = fun) + coord_trans(x = "log1p")) + dev.print(png, file = sprintf("%s_%s_%s_%s.png", sim.name, datetime, response, fun), + width = 768, height = 768, res = 125) + } +} +################################################################################ +### Sim 2 / X has AR(0.707) %x% AR(0.707) Covariance ### +################################################################################ n <- 200 # Sample Size p <- 11 # sample(1:15, 1) @@ -206,80 +200,90 @@ for (rep in 1:reps) { # Save simulation results +sim.name <- "sim02" datetime <- format(Sys.time(), "%Y%m%dT%H%M") -saveRDS(hist, file = sprintf("sim02_%s.rds", datetime)) +saveRDS(hist, file = sprintf("%s_%s.rds", sim.name, datetime)) # for GGPlot2, as factors for grouping hist$repetition <- factor(hist$repetition) -plot.hist(hist, "loss") -dev.print(png, file = sprintf("sim02_loss_%s.png", datetime), width = 768, height = 768, res = 125) -plot.stats(hist, "loss") -dev.print(png, file = sprintf("sim02_loss_stats_%s.png", datetime), width = 768, height = 768, res = 125) - -plot.hist(hist, "dist") -dev.print(png, file = sprintf("sim02_dist_%s.png", datetime), width = 768, height = 768, res = 125) -plot.stats(hist, "dist") -dev.print(png, file = sprintf("sim02_dist_stats_%s.png", datetime), width = 768, height = 768, res = 125) -plot.mean(hist, "dist") -plot.median(hist, "dist") - -plot.hist(hist, "dist.alpha") -dev.print(png, file = sprintf("sim02_dist_alpha_%s.png", datetime), width = 768, height = 768, res = 125) -plot.stats(hist, "dist.alpha") -dev.print(png, file = sprintf("sim02_dist_alpha_stats_%s.png", datetime), width = 768, height = 768, res = 125) -plot.mean(hist, "dist.alpha") -plot.median(hist, "dist.alpha") - -plot.hist(hist, "dist.beta") -dev.print(png, file = sprintf("sim02_dist_beta_%s.png", datetime), width = 768, height = 768, res = 125) -plot.stats(hist, "dist.beta") -dev.print(png, file = sprintf("sim02_dist_beta_stats_%s.png", datetime), width = 768, height = 768, res = 125) -plot.mean(hist, "dist.beta") -plot.median(hist, "dist.beta") - -plot.hist(hist, "norm.alpha") -dev.print(png, file = sprintf("sim02_norm_alpha_%s.png", datetime), width = 768, height = 768, res = 125) -plot.stats(hist, "norm.alpha") -dev.print(png, file = sprintf("sim02_norm_alpha_stats_%s.png", datetime), width = 768, height = 768, res = 125) - -plot.hist(hist, "norm.beta") -dev.print(png, file = sprintf("sim02_norm_beta_%s.png", datetime), width = 768, height = 768, res = 125) -plot.stats(hist, "norm.beta") -dev.print(png, file = sprintf("sim02_norm_beta_stats_%s.png", datetime), width = 768, height = 768, res = 125) - -plot.hist2 <- function(hist, response, type = "all", ...) { - # Extract final results from history - sub <- na.omit(hist[c("iter", response, "method", "repetition")]) - sub <- aggregate(sub, list(sub$method, sub$repetition), tail, 1) - - # Setup ggplot - p <- ggplot(hist, aes_(x = quote(iter), - y = as.name(response), - color = quote(method), - group = quote(interaction(method, repetition)))) - # Add requested layers - if (type == "all") { - p <- p + geom_line(na.rm = TRUE) - p <- p + geom_point(data = sub) - } else if (type == "mean") { - p <- p + geom_line(alpha = 0.5, na.rm = TRUE, linetype = "dotted") - p <- p + geom_point(data = sub, alpha = 0.5) - p <- p + geom_line(aes(group = method), - stat = "summary", fun = "mean", na.rm = TRUE) - } else if (type == "median") { - p <- p + geom_line(alpha = 0.5, na.rm = TRUE, linetype = "dotted") - p <- p + geom_point(data = sub, alpha = 0.5) - p <- p + geom_line(aes(group = method), - stat = "summary", fun = "median", na.rm = TRUE) +for (response in c("loss", "dist", "dist.alpha", "dist.beta")) { + for (fun in c("all", "mean", "median")) { + print(plot.hist2(hist, response, fun, title = fun) + coord_trans(x = "log1p")) + dev.print(png, file = sprintf("%s_%s_%s_%s.png", sim.name, datetime, response, fun), + width = 768, height = 768, res = 125) } - # return with theme and annotations - p + labs(...) + theme(legend.position = "bottom") } -plot.hist2(hist, "dist.alpha", "all", title = "all") + coord_trans(x = "log1p") -plot.hist2(hist, "dist.alpha", "mean", title = "mean") + coord_trans(x = "log1p") -plot.hist2(hist, "dist.alpha", "median", title = "median") + coord_trans(x = "log1p") +################################################################################ +### WIP ### +################################################################################ +n <- 200 # Sample Size +p <- 11 # sample(1:15, 1) +q <- 3 # sample(1:15, 1) +k <- 7 # sample(1:15, 1) +r <- 5 # sample(1:15, 1) +print(c(n, p, q, k, r)) + +alpha.true <- alpha <- matrix(rnorm(q * r), q, r) +beta.true <- beta <- matrix(rnorm(p * k), p, k) +y <- rnorm(n) +Fy <- do.call(cbind, Map(function(slope, offset) { + sin(slope * y + offset) + }, + head(rep(seq(1, ceiling(0.5 * k * r)), each = 2), k * r), + head(rep(c(0, pi / 2), ceiling(0.5 * k * r)), k * r) +)) +X <- tcrossprod(Fy, kronecker(alpha, beta)) + CVarE:::rmvnorm(n, sigma = Delta) + +Delta.1 <- sqrt(0.5)^abs(outer(seq_len(q), seq_len(q), `-`)) +Delta.2 <- sqrt(0.5)^abs(outer(seq_len(p), seq_len(p), `-`)) +Delta <- kronecker(Delta.1, Delta.2) + +shape <- c(p, q, k, r) + +# Base (old) +Rprof(fit.base <- kpir.base(X, Fy, shape, max.iter = 500, logger = prog(500))) + +# New (simple Gradient Descent) +Rprof(fit.new <- kpir.new(X, Fy, shape, max.iter = 500, logger = prog(500))) + +# Gradient Descent with Nesterov Momentum +Rprof(fit.momentum <- kpir.momentum(X, Fy, shape, max.iter = 500, logger = prog(500))) + +# # Residual Covariance Kronecker product assumpton version +# Rprof(fit.kron <- kpir.kron(X, Fy, shape, max.iter = 500, logger = prog(500))) + +# Approximated MLE with Nesterov Momentum +Rprof("kpir.approx.Rprof") +fit.approx <- kpir.approx(X, Fy, shape, max.iter = 500, logger = prog(500)) +summaryRprof("kpir.approx.Rprof") + +par(mfrow = c(2, 2)) +matrixImage(Delta, main = expression(Delta)) +matrixImage(fit.base$Delta, main = expression(hat(Delta)), sub = "base") +matrixImage(fit.momentum$Delta, main = expression(hat(Delta)), sub = "momentum") +matrixImage(kronecker(fit.approx$Delta.1, fit.approx$Delta.2), main = expression(hat(Delta)), sub = "approx") + +par(mfrow = c(2, 2)) +matrixImage(Delta.1, main = expression(Delta[1])) +matrixImage(fit.approx$Delta.1, main = expression(hat(Delta)[1]), sub = "approx") +matrixImage(Delta.2, main = expression(Delta[2])) +matrixImage(fit.approx$Delta.2, main = expression(hat(Delta)[2]), sub = "approx") + +par(mfrow = c(2, 2)) +matrixImage(alpha.true, main = expression(alpha)) +matrixImage(fit.base$alpha, main = expression(hat(alpha)), sub = "base") +matrixImage(fit.momentum$alpha, main = expression(hat(alpha)), sub = "momentum") +matrixImage(fit.approx$alpha, main = expression(hat(alpha)), sub = "approx") + +par(mfrow = c(2, 2)) +matrixImage(beta.true, main = expression(beta)) +matrixImage(fit.base$beta, main = expression(hat(beta)), sub = "base") +matrixImage(fit.momentum$beta, main = expression(hat(beta)), sub = "momentum") +matrixImage(fit.approx$beta, main = expression(hat(beta)), sub = "approx") + + ################################################################################ diff --git a/tensorPredictors/R/kpir_approx.R b/tensorPredictors/R/kpir_approx.R index 084f5ad..5520a9a 100644 --- a/tensorPredictors/R/kpir_approx.R +++ b/tensorPredictors/R/kpir_approx.R @@ -78,17 +78,17 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), R <- X - (Fy %x_3% alpha0 %x_2% beta0) # Covariance estimates and scaling factor - Delta.1 <- tcrossprod(mat(R, 3)) - Delta.2 <- tcrossprod(mat(R, 2)) - s <- sum(diag(Delta.1)) + Delta.1 <- tcrossprod(mat(R, 3)) / n + Delta.2 <- tcrossprod(mat(R, 2)) / n + s <- mean(diag(Delta.1)) # Inverse Covariances Delta.1.inv <- solve(Delta.1) Delta.2.inv <- solve(Delta.2) # cross dependent covariance estimates - S.1 <- n^-1 * tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) - S.2 <- n^-1 * tcrossprod(mat(R, 2), mat(R %x_3% Delta.1.inv, 2)) + S.1 <- tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) / n + S.2 <- tcrossprod(mat(R, 2), mat(R %x_3% Delta.1.inv, 2)) / n # Evaluate negative log-likelihood (2 pi term dropped) loss <- -0.5 * (n * (p * q * log(s) - p * log(det(Delta.1)) - @@ -124,17 +124,17 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), R <- X - (Fy %x_3% alpha.moment %x_2% beta.moment) # Recompute Covariance Estimates and scaling factor - Delta.1 <- tcrossprod(mat(R, 3)) - Delta.2 <- tcrossprod(mat(R, 2)) - s <- sum(diag(Delta.1)) + Delta.1 <- tcrossprod(mat(R, 3)) / n + Delta.2 <- tcrossprod(mat(R, 2)) / n + s <- mean(diag(Delta.1)) # Inverse Covariances Delta.1.inv <- solve(Delta.1) Delta.2.inv <- solve(Delta.2) # cross dependent covariance estimates - S.1 <- n^-1 * tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) - S.2 <- n^-1 * tcrossprod(mat(R, 2), mat(R %x_3% Delta.1.inv, 2)) + S.1 <- tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) / n + S.2 <- tcrossprod(mat(R, 2), mat(R %x_3% Delta.1.inv, 2)) / n # Gradient "generating" tensor G <- (sum(S.1 * Delta.1.inv) - p * q / s) * R @@ -161,12 +161,12 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), # Update Residuals, Covariances, ... R <- X - (Fy %x_3% alpha.temp %x_2% beta.temp) - Delta.1 <- tcrossprod(mat(R, 3)) - Delta.2 <- tcrossprod(mat(R, 2)) - s <- sum(diag(Delta.1)) + Delta.1 <- tcrossprod(mat(R, 3)) / n + Delta.2 <- tcrossprod(mat(R, 2)) / n + s <- mean(diag(Delta.1)) Delta.1.inv <- solve(Delta.1) Delta.2.inv <- solve(Delta.2) - S.1 <- n^-1 * tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) + S.1 <- tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3)) / n # S.2 not needed # Re-evaluate negative log-likelihood diff --git a/tensorPredictors/R/mcrossprod.R b/tensorPredictors/R/mcrossprod.R new file mode 100644 index 0000000..00e41ee --- /dev/null +++ b/tensorPredictors/R/mcrossprod.R @@ -0,0 +1,21 @@ +#' Tensor Mode Crossproduct +#' +#' C = A_(m) t(A_(m)) +#' +#' For a matrix `A`, the first mode is `mcrossprod(A, 1)` equivalent to +#' `A %*% t(A)` (`tcrossprod`). On the other hand for mode two `mcrossprod(A, 2)` +#' the equivalence is `t(A) %*% A` (`crossprod`). +#' +#' @param A multi-dimensional array +#' @param mode index (1-indexed) +#' +#' @returns matrix of dimensions \code{dim(A)[mode] by dim(A)[mode]}. +#' +#' @note equivalent to \code{tcrossprod(mat(A, mode))} with around the same +#' performance but only allocates the result matrix. +#' +#' @export +mcrossprod <- function(A, mode = length(dim(A))) { + storage.mode(A) <- "double" + .Call("C_mcrossprod", A, as.integer(mode)) +} diff --git a/tensorPredictors/R/rtensornorm.R b/tensorPredictors/R/rtensornorm.R new file mode 100644 index 0000000..1880297 --- /dev/null +++ b/tensorPredictors/R/rtensornorm.R @@ -0,0 +1,49 @@ +#' Random sampling from a tensor (multi-array) normal distribution +#' +#' @examples +#' n <- 1000 +#' Sigma.1 <- 0.5^abs(outer(1:5, 1:5, "-")) +#' Sigma.2 <- diag(1:4) +#' X <- rtensornorm(n, 0, Sigma.1, Sigma.2) +#' +#' @export +rtensornorm <- function(n, mean, ..., sample.mode) { + # get covariance matrices + cov <- list(...) + + # get sample shape (dimensions of an observation) + shape <- unlist(Map(nrow, cov)) + + # result tensor dimensions + dims <- c(shape, n) + + # validate mean dimensions + if (!missing(mean)) { + stopifnot(is.numeric(mean)) + stopifnot(is.array(mean) || length(mean) == 1) + stopifnot(length(shape) == length(cov)) + stopifnot(all(shape == dim(mean))) + } + + # sample i.i.d. normal entries + X <- array(rnorm(prod(dims)), dim = dims) + + # transform from standard normal to tensor normal with given covariances + for (i in seq_along(cov)) { + X <- ttm(X, matpow(cov[[i]], 1 / 2), i) + } + + # add mean (using recycling, observations on last mode) + X <- X + mean + + # permute axis for indeing observations on sample mode (permute first axis + # with sample mode axis) + if (!missing(sample.mode)) { + axis <- seq_len(length(dims) - 1) + start <- seq_len(sample.mode - 1) + end <- seq_len(length(dims) - sample.mode) + sample.mode - 1 + X <- aperm(X, c(axis[start], length(dims), axis[end])) + } + + X +} diff --git a/tensorPredictors/src/mcrossprod.c b/tensorPredictors/src/mcrossprod.c new file mode 100644 index 0000000..498b26d --- /dev/null +++ b/tensorPredictors/src/mcrossprod.c @@ -0,0 +1,88 @@ +// The need for `USE_FC_LEN_T` and `FCONE` is due to a Fortran character string +// to C incompatibility. See: Writing R Extentions: 6.6.1 Fortran character strings +#define USE_FC_LEN_T +#include +#include +#include +#ifndef FCONE +#define FCONE +#endif + +/** + * Tensor Mode Crossproduct + * + * C = A_(m) t(A_(m)) + * + * For a matrix `A`, the first mode is `mcrossprod(A, 1)` equivalent to + * `A %*% t(A)` (`tcrossprod`). On the other hand for mode two `mcrossprod(A, 2)` + * the equivalence is `t(A) %*% A` (`crossprod`). + * + * @param A multi-dimensional array + * @param m mode index (1-indexed) + */ +extern SEXP mcrossprod(SEXP A, SEXP m) { + // get zero indexed mode + int mode = asInteger(m) - 1; + + // get dimension attribute of A + SEXP dim = getAttrib(A, R_DimSymbol); + + // validate mode (0-indexed, must be smaller than the tensor order) + if (mode < 0 || length(dim) <= mode) { + error("Illegal mode"); + } + + // the strides + // `stride[0] <- prod(dim(X)[seq_len(mode - 1)])` + // `stride[1] <- dim(X)[mode]` + // `stride[2] <- prod(dim(X)[-seq_len(mode)])` + int stride[3] = {1, INTEGER(dim)[mode], 1}; + for (int i = 0; i < length(dim); ++i) { + int size = INTEGER(dim)[i]; + stride[0] *= (i < mode) ? size : 1; + stride[2] *= (i > mode) ? size : 1; + } + + // create response matrix C + SEXP C = PROTECT(allocMatrix(REALSXP, stride[1], stride[1])); + + // raw data access pointers + double* a = REAL(A); + double* c = REAL(C); + + // employ BLAS dsyrk (Double SYmmeric Rank K) operation + // (C = alpha A A^T + beta C or C = alpha A^T A + beta C) + const double zero = 0.0; + const double one = 1.0; + if (mode == 0) { + // mode 1: special case C = A_(1) A_(1)^T + // C = 1 A A^T + 0 C + F77_CALL(dsyrk)("U", "N", &stride[1], &stride[2], + &one, a, &stride[1], &zero, c, &stride[1] FCONE FCONE); + } else { + // Other modes writen as accumulated sum of matrix products + // initialize C to zero + memset(c, 0, stride[1] * stride[1] * sizeof(double)); + + // Sum over all modes > mode + for (int i2 = 0; i2 < stride[2]; ++i2) { + // C = 1 A^T A + 1 C + F77_CALL(dsyrk)("U", "T", &stride[1], &stride[0], + &one, &a[i2 * stride[0] * stride[1]], &stride[0], + &one, c, &stride[1] FCONE FCONE); + } + } + + // Symmetric matrix result is stored in upper triangular part only + // Copy upper triangular part to lower + for (int j = 0; j + 1 < stride[1]; j++) { + for (int i = j + 1; i < stride[1]; ++i) { + c[i + j * stride[1]] = c[j + i * stride[1]]; + } + } + + // release C to grabage collector + UNPROTECT(1); + + return C; +}