Compare commits
4 Commits
79f5e9781f
...
203028e255
Author | SHA1 | Date |
---|---|---|
Daniel Kapla | 203028e255 | |
Daniel Kapla | b2c6d3ca56 | |
Daniel Kapla | 8ee64ae48c | |
Daniel Kapla | fcef12540f | |
@@ -0,0 +1,63 @@
|
|||
@article{RegMatrixReg-ZhouLi2014,
|
||||
author = {Zhou, Hua and Li, Lexin},
|
||||
title = {Regularized matrix regression},
|
||||
journal = {Journal of the Royal Statistical Society. Series B (Statistical Methodology)},
|
||||
volume = {76},
|
||||
number = {2},
|
||||
pages = {463--483},
|
||||
year = {2014},
|
||||
publisher = {[Royal Statistical Society, Wiley]}
|
||||
}
|
||||
|
||||
@book{StatInf-CasellaBerger2002,
|
||||
title = {{Statistical Inference}},
|
||||
author = {Casella, George and Berger, Roger L.},
|
||||
isbn = {0-534-24312-6},
|
||||
series = {Duxbury Advanced Series},
|
||||
year = {2002},
|
||||
edition = {2},
|
||||
publisher = {Thomson Learning}
|
||||
}
|
||||
|
||||
@book{MatrixDiffCalc-MagnusNeudecker1999,
|
||||
title = {Matrix Differential Calculus with Applications in Statistics and Econometrics (Revised Edition)},
|
||||
author = {Magnus, Jan R. and Neudecker, Heinz},
|
||||
year = {1999},
|
||||
publisher = {John Wiley \& Sons Ltd},
|
||||
isbn = {0-471-98632-1}
|
||||
}
|
||||
|
||||
@book{MatrixAlgebra-AbadirMagnus2005,
|
||||
title = {Matrix Algebra},
|
||||
author = {Abadir, Karim M. and Magnus, Jan R.},
|
||||
year = {2005},
|
||||
publisher = {Cambridge University Press},
|
||||
series = {Econometric Exercises},
|
||||
collection = {Econometric Exercises},
|
||||
place = {Cambridge},
|
||||
doi = {10.1017/CBO9780511810800}
|
||||
}
|
||||
|
||||
@article{TensorDecomp-HuLeeWang2022,
|
||||
author = {Hu, Jiaxin and Lee, Chanwoo and Wang, Miaoyan},
|
||||
title = {Generalized Tensor Decomposition With Features on Multiple Modes},
|
||||
journal = {Journal of Computational and Graphical Statistics},
|
||||
volume = {31},
|
||||
number = {1},
|
||||
pages = {204-218},
|
||||
year = {2022},
|
||||
publisher = {Taylor \& Francis},
|
||||
doi = {10.1080/10618600.2021.1978471},
|
||||
}
|
||||
|
||||
@article{CovarEstSparseKron-LengPan2018,
|
||||
author = {Leng, Chenlei and Pan, Guangming},
|
||||
title = {{Covariance estimation via sparse Kronecker structures}},
|
||||
volume = {24},
|
||||
journal = {Bernoulli},
|
||||
number = {4B},
|
||||
publisher = {Bernoulli Society for Mathematical Statistics and Probability},
|
||||
pages = {3833 -- 3863},
|
||||
year = {2018},
|
||||
doi = {10.3150/17-BEJ980}
|
||||
}
|
|
@@ -0,0 +1,631 @@
|
|||
\documentclass[a4paper, 10pt]{article}
|
||||
|
||||
\usepackage[utf8]{inputenc}
|
||||
\usepackage[T1]{fontenc}
|
||||
\usepackage{fullpage}
|
||||
\usepackage{amsmath, amssymb, amstext, amsthm}
|
||||
\usepackage{bm} % \boldsymbol and italic corrections, ...
|
||||
\usepackage[pdftex]{hyperref}
|
||||
\usepackage{makeidx} % Index (Symbols, Names, ...)
|
||||
\usepackage{xcolor, graphicx} % colors and including images
|
||||
\usepackage{tikz}
|
||||
\usepackage[
|
||||
% backend=bibtex,
|
||||
style=authoryear-comp
|
||||
]{biblatex}
|
||||
|
||||
% Document meta info
|
||||
\title{Derivation of Gradient Descent Algorithm for K-PIR}
|
||||
\author{Daniel Kapla}
|
||||
\date{November 24, 2021}
|
||||
% Set PDF title, author and creator.
|
||||
\AtBeginDocument{
|
||||
\hypersetup{
|
||||
pdftitle = {Derivation of Gradient Descent Algorithm for K-PIR},
|
||||
pdfauthor = {Daniel Kapla},
|
||||
pdfcreator = {\pdftexbanner}
|
||||
}
|
||||
}
|
||||
|
||||
\makeindex
|
||||
|
||||
% Bibliography resource(s)
|
||||
\addbibresource{main.bib}
|
||||
|
||||
% Setup environments
|
||||
% Theorem, Lemma
|
||||
\theoremstyle{plain}
|
||||
\newtheorem{theorem}{Theorem}
|
||||
\newtheorem{lemma}{Lemma}
|
||||
\newtheorem{example}{Example}
|
||||
% Definition
|
||||
\theoremstyle{definition}
|
||||
\newtheorem{defn}{Definition}
|
||||
% Remark
|
||||
\theoremstyle{remark}
|
||||
\newtheorem{remark}{Remark}
|
||||
|
||||
% Define math macros
|
||||
\newcommand{\mat}[1]{\boldsymbol{#1}}
|
||||
\newcommand{\ten}[1]{\mathcal{#1}}
|
||||
\renewcommand{\vec}{\operatorname{vec}}
|
||||
\DeclareMathOperator{\kron}{\otimes} % Kronecker Product
|
||||
\DeclareMathOperator{\hada}{\odot} % Hadamard Product
|
||||
\newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix)
|
||||
\DeclareMathOperator{\df}{\operatorname{df}}
|
||||
\DeclareMathOperator{\tr}{\operatorname{tr}}
|
||||
\DeclareMathOperator{\var}{Var}
|
||||
\DeclareMathOperator{\cov}{Cov}
|
||||
\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}}
|
||||
% \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}}
|
||||
\DeclareMathOperator*{\argmin}{{arg\,min}}
|
||||
\DeclareMathOperator*{\argmax}{{arg\,max}}
|
||||
\newcommand{\D}{\textnormal{D}}
|
||||
\renewcommand{\d}{\textnormal{d}}
|
||||
\renewcommand{\t}[1]{{{#1}'}}
|
||||
\newcommand{\pinv}[1]{{{#1}^{\dagger}}} % `Moore-Penrose pseudoinverse`
|
||||
\newcommand{\todo}[1]{{\color{red}TODO: #1}}
|
||||
|
||||
\begin{document}
|
||||
|
||||
\maketitle
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%% Introduction %%%
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{Introduction}
|
||||
We assume the model
|
||||
\begin{displaymath}
|
||||
\mat{X} = \mat{\mu} + \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}
|
||||
\end{displaymath}
|
||||
where the dimensions of all components are listed in Table~\ref{tab:dimensions},
|
||||
and its vectorized form
|
||||
\begin{displaymath}
|
||||
\vec\mat{X} = \vec\mat{\mu} + (\mat{\alpha}\kron\mat{\beta})\vec\mat{f}_y + \vec\mat{\epsilon}
|
||||
\end{displaymath}
|
||||
|
||||
\begin{table}[!htp]
|
||||
\centering
|
||||
\begin{minipage}{0.8\textwidth}
|
||||
\centering
|
||||
\begin{tabular}{l l}
|
||||
$\mat X, \mat\mu, \mat R, \mat\epsilon$ & $p\times q$ \\
|
||||
$\mat{f}_y$ & $k\times r$ \\
|
||||
$\mat\alpha$ & $q\times r$ \\
|
||||
$\mat\beta$ & $p\times k$ \\
|
||||
$\mat\Delta$ & $p q\times p q$ \\
|
||||
$\mat\Delta_1$ & $q\times q$ \\
|
||||
$\mat\Delta_2$ & $p\times p$ \\
|
||||
$\mat{r}$ & $p q\times 1$ \\
|
||||
\hline
|
||||
$\ten{X}, \ten{R}$ & $n\times p\times q$ \\
|
||||
$\ten{F}$ & $n\times k\times r$ \\
|
||||
\end{tabular}
|
||||
\caption{\label{tab:dimensions}\small Summary listing of dimensions with the corresponding sample versions $\mat{X}_i, \mat{R}_i, \mat{r}_i, \mat{f}_{y_i}$ for $i = 1, ..., n$ as well as estimates $\widehat{\mat{\alpha}}, \widehat{\mat{\beta}}, \widehat{\mat\Delta}, \widehat{\mat\Delta}_1$ and $\widehat{\mat\Delta}_2$.}
|
||||
\end{minipage}
|
||||
\end{table}
|
||||
|
||||
The log-likelihood $l$ for $n$ i.i.d. observations is based on the assumption that $\mat{X}_i\mid(Y = y_i)$ is normally distributed as
|
||||
\begin{displaymath}
|
||||
\vec\mat{X}_i \sim \mathcal{N}_{p q}(\vec\mat\mu + (\mat\alpha\kron\mat\beta)\vec\mat{f}_{y_i}, \mat\Delta).
|
||||
\end{displaymath}
|
||||
Replacing all unknowns by their estimates gives the (estimated) log-likelihood
|
||||
\begin{equation}\label{eq:log-likelihood-est}
|
||||
\hat{l}(\mat\alpha, \mat\beta) = -\frac{n q p}{2}\log 2\pi - \frac{n}{2}\log|\widehat{\mat\Delta}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat\Delta}^{-1}\mat{r}_i
|
||||
\end{equation}
|
||||
where the residuals are
|
||||
\begin{displaymath}
|
||||
\mat{r}_i = \vec\mat{X}_i - \vec\overline{\mat{X}} - (\mat\alpha\kron\mat\beta)\vec{\mat f}_{y_i}\qquad (p q \times 1)
|
||||
\end{displaymath}
|
||||
and the MLE estimate of the covariance matrix $\widehat{\mat\Delta}$, assuming $\mat\alpha, \mat\beta$ are known, is given as the solution to the score equations by
|
||||
\begin{equation}\label{eq:Delta}
|
||||
\widehat{\mat\Delta} = \frac{1}{n}\sum_{i = 1}^n \mat{r}_i \t{\mat{r}_i} \qquad(p q \times p q).
|
||||
\end{equation}
|
||||
Note that the estimated log-likelihood $\hat{l}$ depends only on $\mat\alpha, \mat\beta$. Next, we compute the gradients of $\hat{l}$ with respect to $\mat\alpha$ and $\mat\beta$, which are used to formulate a gradient descent based estimation algorithm for $\mat\alpha, \mat\beta$, replacing the previous algorithm. The main motivation is to enable estimation for larger dimensions of the coefficients $\mat\alpha, \mat\beta$, since the previous algorithm does \emph{not} solve the high run time problem for larger dimensions.
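For concreteness, the following is a minimal schematic sketch (in R) of such a gradient based update loop. The gradient functions are passed in as arguments and act as placeholders for the expressions derived below; the fixed step size and iteration count are purely illustrative and do not reflect the actual implementation.
\begin{verbatim}
# Schematic gradient ascent sketch; `grad.alpha`, `grad.beta` are placeholder
# functions for the gradients derived in the following section.
kpir.gd.sketch <- function(X, Fy, alpha0, beta0, grad.alpha, grad.beta,
                           step.size = 1e-2, max.iter = 100L) {
    alpha <- alpha0
    beta  <- beta0
    for (iter in seq_len(max.iter)) {
        # simultaneous update of both parameter matrices (ascent direction,
        # since the estimated log-likelihood is maximized)
        g.alpha <- grad.alpha(X, Fy, alpha, beta)    # q x r
        g.beta  <- grad.beta(X, Fy, alpha, beta)     # p x k
        alpha   <- alpha + step.size * g.alpha
        beta    <- beta  + step.size * g.beta
    }
    list(alpha = alpha, beta = beta)
}
\end{verbatim}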
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%% Derivative %%%
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{Derivative of the Log-Likelihood}
|
||||
Start with the general case where $\mat X_i\mid(Y_i = y_i)$ is multivariate normally distributed with the covariance $\mat\Delta$ being a $p q\times p q$ symmetric positive definite matrix \emph{without} any further assumptions. We have $i = 1, ..., n$ observations following
|
||||
\begin{displaymath}
|
||||
\mat{r}_i = \vec(\mat X_i - \mat\mu - \mat\beta\mat{f}_{y_i}\t{\mat\alpha}) \sim \mathcal{N}_{p q}(\mat 0, \mat\Delta).
|
||||
\end{displaymath}
|
||||
The MLE estimates of $\mat\mu, \mat\Delta$ are
|
||||
\begin{displaymath}
|
||||
\widehat{\mat\mu} = \overline{\mat X} = \frac{1}{n}\sum_{i = 1}^n \mat X_i {\color{gray}\qquad(p\times q)},
|
||||
\qquad \widehat{\mat\Delta} = \frac{1}{n}\sum_{i = 1}^n \mat r_i\t{\mat r_i} {\color{gray}\qquad(p q\times p q)}.
|
||||
\end{displaymath}
|
||||
Substitution of the MLE estimates into the log-likelihood $l(\mat\mu, \mat\Delta, \mat\alpha, \mat\beta)$ gives the estimated log-likelihood $\hat{l}(\mat\alpha, \mat\beta)$ as
|
||||
\begin{displaymath}
|
||||
\hat{l}(\mat\alpha, \mat\beta) = -\frac{n q p}{2}\log 2\pi - \frac{n}{2}\log|\widehat{\mat\Delta}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat\Delta}^{-1}\mat{r}_i.
|
||||
\end{displaymath}
|
||||
We are interested in the gradients $\nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta)$, $\nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta)$ of the estimated log-likelihood. Therefore, we consider the differential of $\hat{l}$.
|
||||
\begin{align}
|
||||
\d\hat{l}(\mat\alpha, \mat\beta)
|
||||
&= -\frac{n}{2}\d\log|\widehat{\mat{\Delta}}| - \frac{1}{2}\sum_{i = 1}^n \big(\t{(\d \mat{r}_i)}\widehat{\mat{\Delta}}^{-1} \mat{r}_i + \t{\mat{r}_i}(\d\widehat{\mat{\Delta}}^{-1}) \mat{r}_i + \t{\mat{r}_i}\widehat{\mat{\Delta}}^{-1} \d \mat{r}_i\big) \nonumber\\
|
||||
&= \underbrace{-\frac{n}{2}\d\log|\widehat{\mat{\Delta}}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}(\d\widehat{\mat{\Delta}}^{-1}) \mat{r}_i}_{=\,0\text{ due to }\widehat{\mat{\Delta}}\text{ being the MLE}} \label{eq:deriv1}
|
||||
- \sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat{\Delta}}^{-1} \d \mat{r}_i.
|
||||
\end{align}
|
||||
The next step is to compute $\d \mat{r}_i$, which depends on both $\mat\alpha$ and $\mat\beta$:
|
||||
\begin{align*}
|
||||
\d\mat{r}_i(\mat\alpha, \mat\beta)
|
||||
&= -\d(\mat\alpha\kron \mat\beta)\vec\mat{f}_{y_i} \\
|
||||
&= -\vec\!\big( \mat{I}_{p q}\,\d(\mat\alpha\kron \mat\beta)\vec\mat{f}_{y_i} \big) \\
|
||||
&= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})\,\d\vec(\mat\alpha\kron \mat\beta) \\
|
||||
\intertext{using the identity \eqref{eq:vecKron} to obtain vectorized differentials gives}
|
||||
\dots
|
||||
&= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \,\d(\vec \mat\alpha\kron \vec \mat\beta) \\
|
||||
&= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big((\d\vec \mat\alpha)\kron \vec \mat\beta + \vec \mat\alpha\kron (\d\vec \mat\beta)\big) \\
|
||||
&= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big(\mat{I}_{r q}(\d\vec \mat\alpha)\kron (\vec \mat\beta)\mat{I}_1 + (\vec \mat\alpha)\mat{I}_1\kron \mat{I}_{k p}(\d\vec \mat\beta)\big) \\
|
||||
&= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big((\mat{I}_{r q}\kron\vec \mat\beta)\d\vec \mat\alpha + (\vec \mat\alpha\kron \mat{I}_{k p})\d\vec \mat\beta\big)
|
||||
\end{align*}
|
||||
Now, substituting $\d\mat{r}_i$ into \eqref{eq:deriv1} gives the gradients (not the dimension standardized versions of $\D\hat{l}(\mat\alpha)$, $\D\hat{l}(\mat\beta)$) by identification of the derivatives from the differentials (see: \todo{appendix})
|
||||
\begin{align*}
|
||||
\nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) &=
|
||||
\sum_{i = 1}^n (\t{\vec(\mat{f}_{y_i})}\kron\t{\mat{r}_i}\widehat{\mat\Delta}^{-1}) (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) (\mat{I}_{r q}\kron\vec \mat\beta),
|
||||
{\color{gray}\qquad(q\times r)} \\
|
||||
\nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta) &=
|
||||
\sum_{i = 1}^n (\t{\vec(\mat{f}_{y_i})}\kron\t{\mat{r}_i}\widehat{\mat\Delta}^{-1}) (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) (\vec \mat\alpha\kron \mat{I}_{k p}).
|
||||
{\color{gray}\qquad(p\times k)}
|
||||
\end{align*}
|
||||
These expressions are verbose and unsuitable for a direct implementation. A detailed analysis of the gradients shows that the main parts are merely element permutations with high sparsity. We therefore define the following compact matrix
|
||||
\begin{equation}\label{eq:permTransResponse}
|
||||
\mat G = \vec^{-1}_{q r}\bigg(\Big( \sum_{i = 1}^n \vec\mat{f}_{y_i}\otimes \widehat{\mat\Delta}^{-1}\mat{r}_i \Big)_{\pi(j)}\bigg)_{j = 1}^{p q k r}{\color{gray}\qquad(q r \times p k)}
|
||||
\end{equation}
|
||||
with $\pi$ being a permutation of $p q k r$ elements corresponding to permuting the axes of a 4D tensor of dimensions $p\times q\times k\times r$ by $(2, 4, 1, 3)$. As a generalization of transposition, this leads to a rearrangement of the elements corresponding to the permuted 4D tensor with dimensions $q\times r\times p\times k$, which is then vectorized and reshaped into a matrix of dimensions $q r \times p k$. With $\mat G$ the gradients simplify to \todo{validate this mathematically}
|
||||
\begin{align*}
|
||||
\nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) &=
|
||||
\vec_{q}^{-1}(\mat{G}\vec{\mat\beta}),
|
||||
{\color{gray}\qquad(q\times r)} \\
|
||||
\nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta) &=
|
||||
\vec_{p}^{-1}(\t{\mat{G}}\vec{\mat\alpha}).
|
||||
{\color{gray}\qquad(p\times k)}
|
||||
\end{align*}
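As an implementation oriented illustration (keeping in mind that the construction of $\mat{G}$ is still marked for validation above), the following R sketch builds $\mat{G}$ via an axis permutation (\texttt{aperm}) and evaluates both gradient expressions. The inputs are assumed to be a $(p q)\times n$ matrix \texttt{resid} holding the $\mat{r}_i$, a $(k r)\times n$ matrix \texttt{fy} holding the $\vec\mat{f}_{y_i}$ and the estimated covariance \texttt{Delta}; these names are hypothetical.
\begin{verbatim}
# Sketch of the gradient computation via the compact matrix G.
grad.via.G <- function(resid, fy, Delta, alpha, beta) {
    p <- nrow(beta);  k <- ncol(beta)
    q <- nrow(alpha); r <- ncol(alpha)
    Dr <- solve(Delta, resid)    # columns are Delta^-1 r_i
    # sum_i vec(f_{y_i}) (x) Delta^-1 r_i as one vector of length p q k r
    g <- Reduce(`+`, lapply(seq_len(ncol(resid)),
                            function(i) kronecker(fy[, i], Dr[, i])))
    # interpret as p x q x k x r array, permute axes by (2, 4, 1, 3),
    # then reshape into a (q r) x (p k) matrix
    G <- matrix(aperm(array(g, dim = c(p, q, k, r)), c(2, 4, 1, 3)),
                q * r, p * k)
    list(grad.alpha = matrix(G %*% c(beta), q, r),
         grad.beta  = matrix(crossprod(G, c(alpha)), p, k))
}
\end{verbatim}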
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%% Kronecker Covariance Structure %%%
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{Kronecker Covariance Structure}
|
||||
Now we assume the residual covariance has the form $\mat\Delta = \mat\Delta_1\otimes\mat\Delta_2$ where $\mat\Delta_1$, $\mat\Delta_2$ are $q\times q$ and $p\times p$ covariance matrices, respectively. This is analogous to the case where the $\mat{R}_i$'s are i.i.d. matrix normally distributed,
|
||||
\begin{displaymath}
|
||||
\mat{R}_i = \mat{X}_i - \mat\mu - \mat\beta\mat{f}_{y_i}\t{\mat\alpha} \sim \mathcal{MN}_{p\times q}(\mat 0, \mat\Delta_2, \mat\Delta_1).
|
||||
\end{displaymath}
|
||||
The density of the matrix normal (with mean zero) is equivalent to the vectorized quantities being multivariate normally distributed with Kronecker structured covariance
|
||||
\begin{align*}
|
||||
f(\mat R)
|
||||
&= \frac{1}{\sqrt{(2\pi)^{p q}|\mat\Delta|}}\exp\left(-\frac{1}{2}\t{\vec(\mat{R})} \mat\Delta^{-1}\vec(\mat{R})\right) \\
|
||||
&= \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{\mat{R}}\mat\Delta_2^{-1}\mat{R})\right)
|
||||
\end{align*}
|
||||
which, for given data, leads to the log-likelihood
|
||||
\begin{displaymath}
|
||||
l(\mat{\mu}, \mat\Delta_1, \mat\Delta_2) =
|
||||
-\frac{n p q}{2}\log 2\pi
|
||||
-\frac{n p}{2}\log|\mat{\Delta}_1|
|
||||
-\frac{n q}{2}\log|\mat{\Delta}_2|
|
||||
-\frac{1}{2}\sum_{i = 1}^n \tr(\mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i).
|
||||
\end{displaymath}
|
||||
\subsection{MLE covariance estimates}
|
||||
Our first order of business is to derive the MLE estimates of the covariance matrices $\mat\Delta_1$, $\mat\Delta_2$ (the mean estimate $\widehat{\mat\mu}$ is trivial). Therefore, we look at the differentials with respect to changes in the covariance matrices,
|
||||
\begin{align*}
|
||||
\d l(\mat\Delta_1, \mat\Delta_2) &=
|
||||
-\frac{n p}{2}\d\log|\mat{\Delta}_1|
|
||||
-\frac{n q}{2}\d\log|\mat{\Delta}_2|
|
||||
-\frac{1}{2}\sum_{i = 1}^n
|
||||
\tr( (\d\mat\Delta_1^{-1})\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i
|
||||
+ \mat\Delta_1^{-1}\t{\mat{R}_i}(\d\mat\Delta_2^{-1})\mat{R}_i) \\
|
||||
&=
|
||||
-\frac{n p}{2}\tr\mat{\Delta}_1^{-1}\d\mat{\Delta}_1
|
||||
-\frac{n q}{2}\tr\mat{\Delta}_2^{-1}\d\mat{\Delta}_2 \\
|
||||
&\qquad\qquad
|
||||
+\frac{1}{2}\sum_{i = 1}^n
|
||||
\tr( \mat\Delta_1^{-1}(\d\mat\Delta_1)\mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i
|
||||
+ \mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}(\d\mat\Delta_2)\mat\Delta_2^{-1}\mat{R}_i) \\
|
||||
&= \frac{1}{2}\tr\!\Big(\Big(
|
||||
-n p \mat{I}_q + \mat\Delta_1^{-1}\sum_{i = 1}^n \t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i
|
||||
\Big)\mat{\Delta}_1^{-1}\d\mat{\Delta}_1\Big) \\
|
||||
&\qquad\qquad
|
||||
+ \frac{1}{2}\tr\!\Big(\Big(
|
||||
-n q \mat{I}_p + \mat\Delta_2^{-1}\sum_{i = 1}^n \mat{R}_i\mat\Delta_1^{-1}\t{\mat{R}_i}
|
||||
\Big)\mat{\Delta}_2^{-1}\d\mat{\Delta}_2\Big) \overset{!}{=} 0.
|
||||
\end{align*}
|
||||
Setting $\d l$ to zero yields the MLE estimates as
|
||||
\begin{displaymath}
|
||||
\widehat{\mat{\mu}} = \overline{\mat X}{\color{gray}\quad(p\times q)}, \qquad
|
||||
\widehat{\mat\Delta}_1 = \frac{1}{n p}\sum_{i = 1}^n \t{\mat{R}_i}\widehat{\mat\Delta}_2^{-1}\mat{R}_i{\color{gray}\quad(q\times q)}, \qquad
|
||||
\widehat{\mat\Delta}_2 = \frac{1}{n q}\sum_{i = 1}^n \mat{R}_i\widehat{\mat\Delta}_1^{-1}\t{\mat{R}_i}{\color{gray}\quad(p\times p)}.
|
||||
\end{displaymath}
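Since $\widehat{\mat\Delta}_1$ and $\widehat{\mat\Delta}_2$ depend on each other, they have to be computed iteratively. The following R sketch of such a ``flip-flop'' iteration assumes the residuals are given as an $n\times p\times q$ array \texttt{R} (a hypothetical input) and only illustrates the two update formulas.
\begin{verbatim}
# Alternate the two mutually dependent MLE updates until convergence.
flip.flop <- function(R, max.iter = 50L, tol = 1e-8) {
    n <- dim(R)[1]; p <- dim(R)[2]; q <- dim(R)[3]
    Delta.1 <- diag(q)    # initial guesses
    Delta.2 <- diag(p)
    for (iter in seq_len(max.iter)) {
        Delta.1.old <- Delta.1
        # Delta_1 <- (1 / (n p)) sum_i R_i' Delta_2^-1 R_i
        Delta.1 <- matrix(0, q, q)
        for (i in seq_len(n)) {
            Ri <- matrix(R[i, , ], p, q)
            Delta.1 <- Delta.1 + crossprod(Ri, solve(Delta.2, Ri))
        }
        Delta.1 <- Delta.1 / (n * p)
        # Delta_2 <- (1 / (n q)) sum_i R_i Delta_1^-1 R_i'
        Delta.2 <- matrix(0, p, p)
        for (i in seq_len(n)) {
            Ri <- matrix(R[i, , ], p, q)
            Delta.2 <- Delta.2 + Ri %*% solve(Delta.1, t(Ri))
        }
        Delta.2 <- Delta.2 / (n * q)
        if (norm(Delta.1 - Delta.1.old, "F") < tol) break
    }
    list(Delta.1 = Delta.1, Delta.2 = Delta.2)
}
\end{verbatim}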
|
||||
Next, analogous to the above, we take the estimated log-likelihood and derive gradients with respect to $\mat{\alpha}$, $\mat{\beta}$.
|
||||
The estimated log-likelihood is obtained by replacing the unknown covariance matrices by their MLE estimates, leading to
|
||||
\begin{displaymath}
|
||||
\hat{l}(\mat\alpha, \mat\beta) =
|
||||
-\frac{n p q}{2}\log 2\pi
|
||||
-\frac{n p}{2}\log|\widehat{\mat{\Delta}}_1|
|
||||
-\frac{n q}{2}\log|\widehat{\mat{\Delta}}_2|
|
||||
-\frac{1}{2}\sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i)
|
||||
\end{displaymath}
|
||||
and its differential
|
||||
\begin{displaymath}
|
||||
\d\hat{l}(\mat\alpha, \mat\beta) =
|
||||
-\frac{n p}{2}\d\log|\widehat{\mat{\Delta}}_1|
|
||||
-\frac{n q}{2}\d\log|\widehat{\mat{\Delta}}_2|
|
||||
-\frac{1}{2}\sum_{i = 1}^n \d\tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i).
|
||||
\end{displaymath}
|
||||
We first take a closer look at the sum. After a bit of algebra, using $\d\mat A^{-1} = -\mat A^{-1}(\d\mat A)\mat A^{-1}$ and the definitions of $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$, the sum can be rewritten as
|
||||
\begin{displaymath}
|
||||
\frac{1}{2}\sum_{i = 1}^n \d\tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i)
|
||||
= \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\d\mat{R}_i)
|
||||
- \frac{np}{2}\d\log|\widehat{\mat\Delta}_1|
|
||||
- \frac{nq}{2}\d\log|\widehat{\mat\Delta}_2|.
|
||||
\end{displaymath}
|
||||
This means that most of the terms cancel and we get
|
||||
\begin{align*}
|
||||
\d\hat{l}(\mat\alpha, \mat\beta)
|
||||
&= \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\d\mat{R}_i) \\
|
||||
&= \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}((\d\mat\beta)\mat{f}_{y_i}\t{\mat\alpha} + \mat\beta\mat{f}_{y_i}\t{(\d\mat\alpha}))) \\
|
||||
&= \sum_{i = 1}^n \t{\vec(\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i\widehat{\mat{\Delta}}_1^{-1}\mat\alpha\t{\mat{f}_{y_i}})}\d\vec\mat\beta
|
||||
+ \sum_{i = 1}^n \t{\vec(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat\beta\mat{f}_{y_i})}\d\vec\mat\alpha
|
||||
\end{align*}
|
||||
which means the gradients are
|
||||
\begin{align*}
|
||||
\nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta)
|
||||
&= \sum_{i = 1}^n \widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat\beta\mat{f}_{y_i}
|
||||
= (\ten{R}\ttm[3]\widehat{\mat{\Delta}}_1^{-1}\ttm[2]\widehat{\mat{\Delta}}_2^{-1})_{(3)}\t{(\ten{F}\ttm[2]\mat\beta)_{(3)}}\\
|
||||
\nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta)
|
||||
&= \sum_{i = 1}^n \widehat{\mat{\Delta}}_2^{-1}\mat{R}_i\widehat{\mat{\Delta}}_1^{-1}\mat\alpha\t{\mat{f}_{y_i}}
|
||||
= (\ten{R}\ttm[3]\widehat{\mat{\Delta}}_1^{-1}\ttm[2]\widehat{\mat{\Delta}}_2^{-1})_{(2)}\t{(\ten{F}\ttm[3]\mat\alpha)_{(2)}}
|
||||
\end{align*}
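For illustration, a direct R sketch of the sum form of these gradients, assuming the residuals and the $\mat{f}_{y_i}$ are given as $n\times p\times q$ and $n\times k\times r$ arrays \texttt{R} and \texttt{Fy} (hypothetical inputs) and the covariance estimates are already available:
\begin{verbatim}
# Sum-form gradients under the Kronecker covariance assumption.
grad.kron <- function(R, Fy, alpha, beta, Delta.1, Delta.2) {
    n <- dim(R)[1]; p <- dim(R)[2]; q <- dim(R)[3]
    g.alpha <- matrix(0, nrow(alpha), ncol(alpha))   # q x r
    g.beta  <- matrix(0, nrow(beta),  ncol(beta))    # p x k
    for (i in seq_len(n)) {
        Ri <- matrix(R[i, , ], p, q)
        fi <- matrix(Fy[i, , ], ncol(beta), ncol(alpha))   # k x r
        # Delta_1^-1 R_i' Delta_2^-1 beta f_{y_i}
        g.alpha <- g.alpha + solve(Delta.1, t(Ri)) %*% solve(Delta.2, beta %*% fi)
        # Delta_2^-1 R_i Delta_1^-1 alpha f_{y_i}'
        g.beta  <- g.beta  + solve(Delta.2, Ri) %*% solve(Delta.1, alpha %*% t(fi))
    }
    list(grad.alpha = g.alpha, grad.beta = g.beta)
}
\end{verbatim}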
|
||||
|
||||
\paragraph{Comparison to the general case:} There are two main differences. First, the general case has a closed form solution for the gradient due to the explicit nature of the MLE estimate $\widehat{\mat\Delta}$, compared to the mutually dependent MLE estimates $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$. On the other hand, the general case has dramatically bigger dimensions of the covariance matrix ($p q \times p q$) compared to the two Kronecker components with dimensions $q \times q$ and $p \times p$. This means that in the general case there is a huge performance penalty due to the dimensions of $\widehat{\mat\Delta}$, while in the other case an extra iterative estimation is required to determine $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$.
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%% Alternative covariance estimates %%%
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Alternative covariance estimates}
|
||||
An alternative approach is \emph{not} to use the MLE estimates for $\mat\Delta_1$, $\mat\Delta_2$ but (up to scaling) unbiased estimates:
|
||||
\begin{displaymath}
|
||||
\widetilde{\mat\Delta}_1 = \frac{1}{n}\sum_{i = 1}^n \t{\mat{R}_i}\mat{R}_i {\color{gray}\quad(q\times q)},\qquad
|
||||
\widetilde{\mat\Delta}_2 = \frac{1}{n}\sum_{i = 1}^n \mat{R}_i\t{\mat{R}_i} {\color{gray}\quad(p\times p)}.
|
||||
\end{displaymath}
|
||||
The unbiasedness follows directly from the following short computation:
|
||||
\begin{displaymath}
|
||||
(\E\widetilde{\mat\Delta}_1)_{j,k} = \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p \E \mat{R}_{i,l,j}\mat{R}_{i,l,k}
|
||||
= \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p (\mat{\Delta}_{2})_{l,l}(\mat{\Delta}_{1})_{j,k}
|
||||
= (\mat\Delta_1\tr(\mat\Delta_2))_{j,k}.
|
||||
\end{displaymath}
|
||||
This means that $\E\widetilde{\mat\Delta}_1 = \mat\Delta_1\tr(\mat\Delta_2)$ and, in analogy, $\E\widetilde{\mat\Delta}_2 = \mat\Delta_2\tr(\mat\Delta_1)$. Now we need to handle the scaling, which can be estimated without bias by
|
||||
\begin{displaymath}
|
||||
\tilde{s} = \frac{1}{n}\sum_{i = 1}^n \|\mat{R}_i\|_F^2
|
||||
\end{displaymath}
|
||||
because with $\|\mat{R}_i\|_F^2 = \tr \mat{R}_i\t{\mat{R}_i} = \tr \t{\mat{R}_i}\mat{R}_i$ the scale estimate satisfies $\tilde{s} = \tr(\widetilde{\mat\Delta}_1) = \tr(\widetilde{\mat\Delta}_2)$. Then $\E\tilde{s} = \tr(\E\widetilde{\mat\Delta}_1) = \tr{\mat\Delta}_1 \tr{\mat\Delta}_2 = \tr({\mat\Delta}_1\otimes{\mat\Delta}_2)$. This leads to the covariance estimate
|
||||
\begin{displaymath}
|
||||
\widetilde{\mat\Delta} = \tilde{s}^{-1}(\widetilde{\mat{\Delta}}_1\otimes\widetilde{\mat{\Delta}}_2)
|
||||
\end{displaymath}
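A minimal R sketch of these closed form estimates, again assuming the residuals are given as an $n\times p\times q$ array \texttt{R} (hypothetical input):
\begin{verbatim}
# Closed-form (up to scaling) covariance estimates.
n <- dim(R)[1]; p <- dim(R)[2]; q <- dim(R)[3]
D1.tilde <- matrix(0, q, q)    # estimates Delta_1 * tr(Delta_2)
D2.tilde <- matrix(0, p, p)    # estimates Delta_2 * tr(Delta_1)
for (i in seq_len(n)) {
    Ri <- matrix(R[i, , ], p, q)
    D1.tilde <- D1.tilde + crossprod(Ri)     # R_i' R_i
    D2.tilde <- D2.tilde + tcrossprod(Ri)    # R_i R_i'
}
D1.tilde <- D1.tilde / n
D2.tilde <- D2.tilde / n
s.tilde  <- sum(R^2) / n    # = tr(D1.tilde) = tr(D2.tilde)
# scaled Kronecker structured covariance estimate (p q x p q)
Delta.tilde <- kronecker(D1.tilde, D2.tilde) / s.tilde
\end{verbatim}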
|
||||
|
||||
\todo{ prove they are consistent, especially $\widetilde{\mat\Delta} = \tilde{s}^{-1}(\widetilde{\mat\Delta}_1\otimes\widetilde{\mat\Delta}_2)$!}
|
||||
|
||||
The hoped-for benefit is that these covariance estimates are available in closed form, which means there is no need for an additional iterative estimation step. Before we start with the derivation of the gradients, define the following two quantities
|
||||
\begin{align*}
|
||||
\mat{S}_1 = \frac{1}{n}\sum_{i = 1}^n \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i = \frac{1}{n}\ten{R}_{(3)}\t{(\ten{R}\ttm[2]\widetilde{\mat{\Delta}}_2^{-1})_{(3)}}\quad{\color{gray}(q\times q)}, \\
|
||||
\mat{S}_2 = \frac{1}{n}\sum_{i = 1}^n \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} = \frac{1}{n}\ten{R}_{(2)}\t{(\ten{R}\ttm[3]\widetilde{\mat{\Delta}}_1^{-1})_{(2)}}\quad{\color{gray}(p\times p)}.
|
||||
\end{align*}
|
||||
\todo{Check tensor form!}
|
||||
|
||||
Now, the matrix normal density with the covariance matrix of the vectorized quantities being of the form $\mat{\Delta} = s^{-1}(\mat{\Delta}_1\otimes\mat{\Delta}_2)$ is
|
||||
\begin{align*}
|
||||
f(\mat R)
|
||||
&= \frac{1}{\sqrt{(2\pi)^{p q}|\mat\Delta|}}\exp\left(-\frac{1}{2}\t{\vec(\mat{R})} \mat\Delta^{-1}\vec(\mat{R})\right) \\
|
||||
&= \frac{s^{p q / 2}}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{s}{2}\tr(\mat\Delta_1^{-1}\t{\mat{R}}\mat\Delta_2^{-1}\mat{R})\right)
|
||||
\end{align*}
|
||||
|
||||
The approximated log-likelihood is then
|
||||
\begin{align*}
|
||||
\tilde{l}(\mat\alpha, \mat\beta)
|
||||
&=
|
||||
-\frac{n p q}{2}\log{2\pi}
|
||||
-\frac{n}{2}\log|\widetilde{\mat{\Delta}}|
|
||||
-\frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widetilde{\mat{\Delta}}^{-1}\mat{r}_i \\
|
||||
&=
|
||||
-\frac{n p q}{2}\log{2\pi}
|
||||
+\frac{n p q}{2}\log\tilde{s}
|
||||
-\frac{n p}{2}\log|\widetilde{\mat{\Delta}}_1|
|
||||
-\frac{n q}{2}\log|\widetilde{\mat{\Delta}}_2|
|
||||
-\frac{\tilde{s}}{2}\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i).
|
||||
\end{align*}
|
||||
The second form is due to the determinant's behavior under scaling and the Kronecker product, giving $|\widetilde{\mat\Delta}| = (\tilde{s}^{-1})^{p q}|\widetilde{\mat{\Delta}}_1|^p |\widetilde{\mat{\Delta}}_2|^q$, as well as an analogous Kronecker decomposition as in the MLE case.
|
||||
|
||||
Note that the following holds:
|
||||
\begin{displaymath}
|
||||
\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i)
|
||||
= n \tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)
|
||||
= n \tr(\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2)
|
||||
= n \tr(\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1})
|
||||
= n \tr(\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1}).
|
||||
\end{displaymath}
|
||||
|
||||
The derivation of the gradient of the approximated log-likelihood $\tilde{l}$ is tedious but straightforward. We tackle the summands separately;
|
||||
\begin{align*}
|
||||
\d\log\tilde{s} &= \tilde{s}^{-1}\d\tilde{s} = \frac{2}{n\tilde{s}}\sum_{i = 1}^n \tr(\t{\mat{R}_i}\d\mat{R}_i)
|
||||
= -\frac{2}{n\tilde{s}}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}), \\
|
||||
\d\log|\widetilde{\mat{\Delta}}_1| &=\tr(\widetilde{\mat{\Delta}}_1^{-1}\d\widetilde{\mat{\Delta}}_1) = \frac{2}{n}\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\d\mat{R}_i)
|
||||
= -\frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\d\mat{\beta}), \\
|
||||
\d\log|\widetilde{\mat{\Delta}}_2| &=\tr(\widetilde{\mat{\Delta}}_2^{-1}\d\widetilde{\mat{\Delta}}_2) = \frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\d\mat{R}_i)
|
||||
= -\frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\d\mat{\beta})
|
||||
\end{align*}
|
||||
as well as
|
||||
\begin{displaymath}
|
||||
\d\,\tilde{s}\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i)
|
||||
= (\d\tilde{s})\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i)
|
||||
+ \tilde{s}\, \d \sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i).
|
||||
\end{displaymath}
|
||||
We have
|
||||
\begin{displaymath}
|
||||
\d\tilde{s} = -\frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta})
|
||||
\end{displaymath}
|
||||
and the remaining term
|
||||
\begin{align*}
|
||||
\d\sum_{i = 1}^n\tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i)
|
||||
= 2\sum_{i = 1}^n \tr(&\t{\mat{f}_{y_i}}\t{\mat{\beta }}(\mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1} + \widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i - \widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i \widetilde{\mat{\Delta}}_1^{-1})\d\mat{\alpha} \\
|
||||
+\,&\mat{f}_{y_i} \t{\mat{\alpha}}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1})\d\mat{\beta }).
|
||||
\end{align*}
|
||||
The last one is tedious but straightforward. Its computation extensively uses the symmetry of $\widetilde{\mat{\Delta}}_1$, $\widetilde{\mat{\Delta}}_2$, the cyclic property of the trace and the relation $\d\mat{A}^{-1} = -\mat{A}^{-1}(\d\mat{A})\mat{A}^{-1}$.
|
||||
|
||||
Putting it all together
|
||||
\begin{align*}
|
||||
\d\tilde{l}(\mat{\alpha}, \mat{\beta})
|
||||
&= \frac{n p q}{2}\Big(-\frac{2}{n\tilde{s}}\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}) \\
|
||||
&\hspace{3em} - \frac{n p}{2}\Big(-\frac{2}{n}\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\d\mat{\beta}) \\
|
||||
&\hspace{3em} - \frac{n q}{2}\Big(-\frac{2}{n}\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\d\mat{\beta}) \\
|
||||
&\hspace{3em} -\frac{1}{2}\Big(-\frac{2}{n}\Big)\Big(\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i)\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}) \\
|
||||
&\hspace{3em} -\frac{\tilde{s}}{2}2\sum_{i = 1}^n \tr\!\Big(\t{\mat{f}_{y_i}}\t{\mat{\beta }}(\mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1} + \widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i - \widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i \widetilde{\mat{\Delta}}_1^{-1})\d\mat{\alpha} \\
|
||||
&\hspace{3em} \hspace{4.7em} + \mat{f}_{y_i} \t{\mat{\alpha}}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1})\d\mat{\beta }\Big) \\
|
||||
%
|
||||
&= \sum_{i = 1}^n \tr\bigg(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\Big(
|
||||
-p q \tilde{s}^{-1} \mat{R}_i + p \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1} + q \widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i + \tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)\mat{R}_i \\
|
||||
&\hspace{3em} \hspace{4.7em} - \tilde{s}(\mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1} + \widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i - \widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i \widetilde{\mat{\Delta}}_1^{-1})
|
||||
\Big)\d\mat{\alpha}\bigg) \\
|
||||
&\hspace{3em}+ \sum_{i = 1}^n \tr\bigg(\mat{f}_{y_i}\t{\mat{\alpha}}\Big(
|
||||
-p q \tilde{s}^{-1} \t{\mat{R}_i} + p \widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + q \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1} + \tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)\t{\mat{R}_i} \\
|
||||
&\hspace{3em}\hspace{3em} \hspace{4.7em} - \tilde{s}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1} \t{\mat{R}_i} \widetilde{\mat{\Delta}}_2^{-1})
|
||||
\Big)\d\mat{\beta}\bigg).
|
||||
\end{align*}
|
||||
Observe that the bracketed expressions before $\d\mat{\alpha}$ and $\d\mat{\beta}$ are transposes of each other. Let us denote the expression for $\d\mat{\alpha}$ as $\mat{G}_i$, which has the form
|
||||
\begin{displaymath}
|
||||
\mat{G}_i
|
||||
= (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\mat{R}_i
|
||||
+ (q\mat{I}_p - \tilde{s}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2)\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i
|
||||
+ \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}(p\mat{I}_q - \tilde{s}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1})
|
||||
+ \tilde{s}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}
|
||||
\end{displaymath}
|
||||
and with $\ten{G}$ the order 3 tensor stacking the $\mat{G}_i$'s such that the first mode indexes the observations,
|
||||
\begin{displaymath}
|
||||
\ten{G}
|
||||
= (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\ten{R}
|
||||
+ \ten{R}\ttm[2](q\mat{I}_p - \tilde{s}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2)\widetilde{\mat{\Delta}}_2^{-1}
|
||||
+ \ten{R}\ttm[3](p\mat{I}_q - \tilde{s}\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)\widetilde{\mat{\Delta}}_1^{-1}
|
||||
+ \tilde{s}\ten{R}\ttm[2]\widetilde{\mat{\Delta}}_2^{-1}\ttm[3]\widetilde{\mat{\Delta}}_1^{-1}
|
||||
\end{displaymath}
|
||||
This leads to the following form of the differential of $\tilde{l}$:
|
||||
\begin{displaymath}
|
||||
\d\tilde{l}(\mat{\alpha}, \mat{\beta})
|
||||
= \sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{G}_i\d\mat{\alpha})
|
||||
+ \sum_{i = 1}^n \tr(\mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{G}_i}\d\mat{\beta})
|
||||
\end{displaymath}
|
||||
and therefore the gradients
|
||||
\begin{align*}
|
||||
\nabla_{\mat{\alpha}}\tilde{l}(\mat{\alpha}, \mat{\beta}) &= \sum_{i = 1}^n \t{\mat{G}_i}\mat{\beta}\mat{f}_{y_i}
|
||||
= \ten{G}_{(3)}\t{(\ten{F}\ttm[2]\mat{\beta})_{(3)}}, \\
|
||||
\nabla_{\mat{\beta}} \tilde{l}(\mat{\alpha}, \mat{\beta}) &= \sum_{i = 1}^n \mat{G}_i\mat{\alpha}\t{\mat{f}_{y_i}}
|
||||
= \ten{G}_{(2)}\t{(\ten{F}\ttm[3]\mat{\alpha})_{(2)}}.
|
||||
\end{align*}
|
||||
|
||||
\todo{check the tensor version of the gradient!!!}
|
||||
|
||||
\newpage
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%% Numerical Examples %%%
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{Numerical Examples}
|
||||
The first example (which by itself is \emph{not} exemplary) is the estimation with parameters $n = 200$, $p = 11$, $q = 5$, $k = 14$ and $r = 9$. The ``true'' matrices $\mat\alpha$, $\mat\beta$ are generated by sampling their elements i.i.d. standard normal, as are the responses $y$. Then, for each observation, the elements of $\mat{f}_y$ are computed as $\sin(s_{i, j} y + o_{i j})$ \todo{ properly describe}. Then the $\mat{X}$'s are sampled as
|
||||
\begin{displaymath}
|
||||
\mat{X} = \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}, \qquad \vec{\mat{\epsilon}} \sim \mathcal{N}_{p q}(\mat{0}, \mat{\Delta})
|
||||
\end{displaymath}
|
||||
where $\mat{\Delta}_{i j} = 0.5^{|i - j|}$ for $i, j = 1, ..., p q$.
|
||||
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics{loss_Ex01.png}
|
||||
\end{figure}
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics{estimates_Ex01.png}
|
||||
\end{figure}
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics{Delta_Ex01.png}
|
||||
\end{figure}
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics{hist_Ex01.png}
|
||||
\end{figure}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%% Bib and Index %%%
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\printindex
|
||||
\nocite{*}
|
||||
\printbibliography
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%% Appendix %%%
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\appendix
|
||||
\section{Matrix Differential Rules}
|
||||
Let $\mat A$ be a square matrix (invertible where needed) and let $|\,\cdot\,|$ denote the determinant:
|
||||
\begin{align*}
|
||||
\d\log|\mat A| &= \frac{1}{|\mat A|}\d|\mat{A}| \\
|
||||
\d|\mat A| &= |\mat A|\tr \mat{A}^{-1}\d\mat A \\
|
||||
\d\log|\mat A| &= \tr\mat{A}^{-1}\d\mat A \\
|
||||
\d\mat{X}^{-1} &= -\mat{X}^{-1}(\d\mat{X})\mat{X}^{-1}
|
||||
\end{align*}
|
||||
|
||||
\section{Useful Matrix Identities}
|
||||
In this section we summarize a few useful matrix identities; for more details see, for example, \cite{MatrixAlgebra-AbadirMagnus2005}.
|
||||
|
||||
For two matrices $\mat A$ of dimensions $q\times r$ and $\mat B$ of dimensions $p\times k$ it holds that
|
||||
\begin{equation}\label{eq:vecKron}
|
||||
\vec(\mat A\kron\mat B) = (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p)(\vec\mat A\kron\vec\mat B).
|
||||
\end{equation}
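A quick numerical sanity check of \eqref{eq:vecKron} in base R (the commutation matrix construction used here is spelled out in the following section; the dimensions are arbitrary):
\begin{verbatim}
# Commutation matrix K_{p,q} such that K %*% c(A) == c(t(A)) for A of size p x q.
K <- function(p, q) diag(p * q)[as.vector(t(matrix(seq_len(p * q), p, q))), ]
p <- 2; q <- 3; k <- 4; r <- 5
A <- matrix(rnorm(q * r), q, r)
B <- matrix(rnorm(p * k), p, k)
lhs <- c(kronecker(A, B))                                    # vec(A (x) B)
rhs <- (diag(r) %x% K(k, q) %x% diag(p)) %*% (c(A) %x% c(B))
stopifnot(all.equal(lhs, drop(rhs)))
\end{verbatim}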
|
||||
|
||||
Let $\mat A$ be a $p\times p$ dimensional non-singular matrix. Furthermore, let $\mat a, \mat b$ be $p$-vectors such that $\t{\mat b}\mat{A}^{-1}\mat a\neq -1$; then
|
||||
\begin{displaymath}
|
||||
(\mat A + \mat a\t{\mat b})^{-1} = \mat{A}^{-1} - \frac{1}{1 + \t{\mat b}\mat{A}^{-1}\mat a}\mat{A}^{-1}\mat{a}\t{\mat{b}}\mat{A}^{-1}
|
||||
\end{displaymath}
|
||||
as well as
|
||||
\begin{displaymath}
|
||||
\det(\mat A + \mat a\t{\mat b}) = \det(\mat A)(1 + \t{\mat b}{\mat A}^{-1}\mat a)
|
||||
\end{displaymath}
|
||||
which even holds in the case $\t{\mat b}\mat{A}^{-1}\mat a = -1$. This is known as Sylvester's determinant theorem.
|
||||
|
||||
|
||||
\section{Commutation Matrix and Permutation Identities}
|
||||
\begin{center}
|
||||
Note: In this section we use 0-indexing for the sake of simplicity!
|
||||
\end{center}
|
||||
In this section we summarize relations between the commutation matrix and the corresponding permutation. We also list some extensions to ``simplify'' or represent certain terms. This is mostly intended for implementation purposes and for understanding the terms occurring in the computations.
|
||||
|
||||
Let $\mat A$ be an arbitrary $p\times q$ matrix. The commutation matrix $\mat K_{p, q}$ (a permutation matrix) satisfies
|
||||
\begin{displaymath}
|
||||
\mat{K}_{p, q}\vec{\mat{A}} = \vec{\t{\mat{A}}} \quad\Leftrightarrow\quad (\vec{\mat{A}})_{\pi_{p, q}(i)} = (\vec{\t{\mat{A}}})_{i}, \quad\text{for } i = 0, ..., p q - 1
|
||||
\end{displaymath}
|
||||
where $\pi_{p, q}$ is a permutation of the indices $i = 0, ..., p q - 1$ such that
|
||||
\begin{displaymath}
|
||||
\pi_{p, q}(i + j p) = j + i q, \quad\text{for }i = 0, ..., p - 1; j = 0, ..., q - 1.
|
||||
\end{displaymath}
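As a small base-R illustration (note that R is 1-indexed, in contrast to the 0-indexing used in this section), the commutation matrix can be realized as a row permutation of the identity matrix:
\begin{verbatim}
# K_{p,q} as a permuted identity; verifies K vec(A) = vec(A').
K <- function(p, q) diag(p * q)[as.vector(t(matrix(seq_len(p * q), p, q))), ]
p <- 3; q <- 5
A <- matrix(rnorm(p * q), p, q)
stopifnot(all.equal(drop(K(p, q) %*% c(A)), c(t(A))))
\end{verbatim}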
|
||||
|
||||
\begin{table}[!htp]
|
||||
\centering
|
||||
\begin{minipage}{0.8\textwidth}
|
||||
\centering
|
||||
\begin{tabular}{l c l}
|
||||
$\mat{K}_{p, q}$ & $\hat{=}$ & $\pi_{p, q}(i + j p) = j + i q$ \\
|
||||
$\mat{I}_r\kron\mat{K}_{p, q}$ & $\hat{=}$ & $\tilde{\pi}_{p, q, r}(i + j p + k p q) = j + i q + k p q$ \\
|
||||
$\mat{K}_{p, q}\kron\mat{I}_r$ & $\hat{=}$ & $\hat{\pi}_{p, q, r}(i + j p + k p q) = r(j + i q) + k$
|
||||
\end{tabular}
|
||||
\caption{\label{tab:commutation-permutation}Commutation matrix terms and corresponding permutations. Indices are all 0-indexed with the ranges; $i = 0, ..., p - 1$, $j = 0, ..., q - 1$ and $k = 0, ..., r - 1$.}
|
||||
\end{minipage}
|
||||
\end{table}
|
||||
|
||||
|
||||
|
||||
\section{Matrix and Tensor Operations}
|
||||
|
||||
The \emph{Kronecker product}\index{Operations!Kronecker@$\kron$ Kronecker product} is denoted as $\kron$ and the \emph{Hadamard product} uses the symbol $\circ$. We also need the \emph{Khatri-Rao product}\index{Operations!KhatriRao@$\hada$ Khatri-Rao product}
|
||||
$\hada$ as well as the \emph{Transposed Khatri-Rao product} $\odot_t$ (or \emph{Face-Splitting product}). There is also the \emph{$n$-mode Tensor Matrix Product}\index{Operations!ttm@$\ttm[n]$ $n$-mode tensor product} denoted by $\ttm[n]$ in conjunction with the \emph{$n$-mode Matricization} of a Tensor $\mat{T}$ written as $\mat{T}_{(n)}$, which is a matrix. See below for definitions and examples of these operations.\todo{ Definitions and Examples}
|
||||
|
||||
\todo{ resolve confusion between Khatri-Rao, Column-wise Kronecker / Khatri-Rao, Row-wise Kronecker / Khatri-Rao, Face-Splitting Product, .... Yes, its a mess.}
|
||||
\paragraph{Kronecker Product $\kron$:}
|
||||
\paragraph{Khatri-Rao Product $\hada$:}
|
||||
\paragraph{Transposed Khatri-Rao Product $\odot_t$:} This is also known as the Face-Splitting Product and is the row-wise Kronecker product of two matrices. It relates to the column-wise Kronecker product through
|
||||
\begin{displaymath}
|
||||
\t{(\mat{A}\odot_{t}\mat{B})} = \t{\mat{A}}\hada\t{\mat{B}}
|
||||
\end{displaymath}
|
||||
|
||||
\paragraph{$n$-mode unfolding:} \emph{Unfolding}, also known as \emph{flattening} or \emph{matricization}, is a reshaping of a tensor into a matrix with a rearrangement of the elements such that mode $n$ corresponds to the rows of the resulting matrix and all other modes are vectorized into its columns. Let $\ten{T}$ be a tensor of order $m$ with dimensions $t_1\times ... \times t_n\times ... \times t_m$ and elements indexed by $(i_1, ..., i_n, ..., i_m)$. The $n$-mode flattening, denoted $\ten{T}_{(n)}$, is defined as a $(t_n, \prod_{k\neq n}t_k)$ matrix with element indices $(i_n, j)$ such that $j = \sum_{k = 1, k\neq n}^m i_k\prod_{l = 1, l\neq n}^{k - 1}t_l$.
|
||||
\todo{ give an example!}
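As one possible example, the following base-R helper implements the unfolding by moving mode $n$ to the front and reshaping, which follows the index ordering of the definition above (only an illustration, not the package implementation):
\begin{verbatim}
# n-mode unfolding: a dim(A)[n] x prod(dim(A)[-n]) matrix.
unfold <- function(A, n) {
    d <- dim(A)
    matrix(aperm(A, c(n, seq_along(d)[-n])), d[n], prod(d[-n]))
}
A <- array(seq_len(2 * 3 * 4), dim = c(2, 3, 4))
dim(unfold(A, 1))    # 2 x 12
dim(unfold(A, 2))    # 3 x  8
dim(unfold(A, 3))    # 4 x  6
\end{verbatim}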
|
||||
|
||||
\paragraph{$n$-mode Tensor Product $\ttm[n]$:}
|
||||
The \emph{$n$-mode tensor product} $\ttm[n]$ between a tensor $\ten{T}$ of order $m$ with dimensions $t_1\times t_2\times ... \times t_n\times ... \times t_m$ and a $p\times t_n$ matrix $\mat{M}$ is defined element-wise as
|
||||
\begin{displaymath}
|
||||
(\ten{T}\ttm[n] \mat{M})_{i_1, ..., i_{n-1}, j, i_{n+1}, ..., i_m} = \sum_{k = 1}^{t_n} \ten{T}_{i_1, ..., i_{n-1}, k, i_{n+1}, ..., i_m} \mat{M}_{j, k}
|
||||
\end{displaymath}
|
||||
where $i_1, ..., i_{n-1}, i_{n+1}, ..., i_m$ run from $1$ to $t_1, ..., t_{n-1}, t_{n+1}, ..., t_m$, respectively. Furthermore, the $n$-th fiber index $j$ of the product ranges from $1$ to $p$. This gives a new tensor $\ten{T}\ttm[n]\mat{M}$ of order $m$ with dimensions $t_1\times t_2\times ... \times p\times ... \times t_m$.
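A base-R sketch of this operation via the $n$-mode unfolding (again only an illustration under the stated conventions, not the package implementation):
\begin{verbatim}
# n-mode product A x_n M of an array A with a matrix M (nrow(M) x dim(A)[n]).
ttm <- function(A, M, n) {
    d <- dim(A)
    perm <- c(n, seq_along(d)[-n])
    An <- matrix(aperm(A, perm), d[n])               # n-mode unfolding
    d[n] <- nrow(M)
    aperm(array(M %*% An, d[perm]), order(perm))     # fold back, restore mode order
}
A <- array(rnorm(2 * 3 * 4), dim = c(2, 3, 4))
M <- matrix(rnorm(5 * 3), 5, 3)
dim(ttm(A, M, 2))    # 2 x 5 x 4
\end{verbatim}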
|
||||
|
||||
\begin{example}[Matrix Multiplication Analogs]
|
||||
Let $\mat{A}$, $\mat{B}$ be two matrices with dimensions $t_1\times t_2$ and $p\times q$, respectively. Then $\mat{A}$ is also a tensor of order $2$, and the $1$-mode and $2$-mode products are element-wise given by
|
||||
\begin{align*}
|
||||
(\mat{A}\ttm[1] \mat{B})_{i,j} &= \sum_{l = 1}^{t_1} \mat{A}_{l,j}\mat{B}_{i,l}
|
||||
= (\mat{B}\mat{A})_{i,j}
|
||||
& \text{for }t_1 = q, \\
|
||||
(\mat{A}\ttm[2] \mat{B})_{i,j} &= \sum_{l = 1}^{t_2} \mat{A}_{i,l}\mat{B}_{j,l}
|
||||
= (\mat{A}\t{\mat{B}})_{i,j} = \t{(\mat{B}\t{\mat{A}})}_{i,j}
|
||||
& \text{for }t_2 = q.
|
||||
\end{align*}
|
||||
In other words, the $1$-mode product equals $\mat{A}\ttm[1] \mat{B} = \mat{B}\mat{A}$ and the $2$-mode is $\mat{A}\ttm[2] \mat{B} = \t{(\mat{B}\t{\mat{A}})}$ in the case of the tensor $\mat{A}$ being a matrix.
|
||||
\end{example}
|
||||
|
||||
\begin{example}[Order Three Analogs]
|
||||
Let $\mat{A}$ be a tensor of dimensions $t_1\times t_2\times t_3$ and $\mat{B}$ a matrix of dimensions $p\times q$; then the $n$-mode products take the following form
|
||||
\begin{align*}
|
||||
(\mat{A}\ttm[1]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_1} \mat{A}_{l,j,k}\mat{B}_{i,l} & \text{for }t_1 = q, \\
|
||||
(\mat{A}\ttm[2]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_2} \mat{A}_{i,l,k}\mat{B}_{j,l} \equiv (\mat{B}\mat{A}_{i,:,:})_{j,k} & \text{for }t_2 = q, \\
|
||||
(\mat{A}\ttm[3]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_3} \mat{A}_{i,j,l}\mat{B}_{k,l} \equiv \t{(\mat{B}\t{\mat{A}_{i,:,:}})}_{j,k} & \text{for }t_3 = q.
|
||||
\end{align*}
|
||||
\end{example}
|
||||
|
||||
Let $\ten{F}$ be the $3$-tensor of dimensions $n\times k\times r$ such that $\ten{F}_{i,:,:} = \mat{f}_{y_i}$; then
|
||||
\begin{displaymath}
|
||||
\mat{\beta}\mat{f}_{y_i}\t{\mat{\alpha}} = (\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha})_{i,:,:}
|
||||
\end{displaymath}
|
||||
or in other words, the $i$-th slice of the tensor product $\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha}$ contains $\mat{\beta}\mat{f}_{y_i}\t{\mat{\alpha}}$ for $i = 1, ..., n$.
|
||||
Another analogous way of writing this is
|
||||
\begin{displaymath}
|
||||
(\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha})_{(1)} = \mathbb{F}_{y}(\t{\mat{\alpha}}\kron\t{\mat{\beta}})
|
||||
\end{displaymath}
|
||||
|
||||
\section{Equivalencies}
|
||||
In this section we give a short summary of alternative but equivalent operations.
|
||||
We use the notation $\widehat{=}$ to indicate that two expressions are identical in the sense that they contain the same elements in the same order but may have different dimensions. That is, when vectorizing either side of $\widehat{=}$, they are equal ($\mat{A}\widehat{=}\mat{B}\ :\Leftrightarrow\ \vec{\mat{A}} = \vec{\mat{B}}$).
|
||||
|
||||
Throughout, we use $\mat{A}, \mat{B}, \mat{X}, \mat{F}, \mat{R}, ...$ for matrices. 3-tensors are written as $\ten{A}, \ten{B}, \ten{T}, \ten{X}, \ten{F}, \ten{R}, ...$.
|
||||
|
||||
\begin{align*}
|
||||
\ten{T}\ttm[3]\mat{A}\ &{\widehat{=}}\ \mat{T}\t{\mat A} & \ten{T}(n, p, q)\ \widehat{=}\ \mat{T}(n p, q), \mat{A}(p, q) \\
|
||||
\ten{T}\ttm[2]\mat{B}\ &{\widehat{=}}\ \mat{B}\ten{T}_{(2)} & \ten{T}(n, p, q), \ten{T}_{(2)}(p, n q), \mat{B}(q, p)
|
||||
\end{align*}
|
||||
|
||||
\section{Matrix Valued Normal Distribution}
|
||||
A random variable $\mat{X}$ of dimensions $p\times q$ follows a \emph{Matrix-Valued Normal Distribution}, denoted
|
||||
\begin{displaymath}
|
||||
\mat{X}\sim\mathcal{MN}_{p\times q}(\mat{\mu}, \mat{\Delta}_2, \mat{\Delta}_1),
|
||||
\end{displaymath}
|
||||
if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otimes\mat\Delta_2)$. Note the order of the covariance matrices $\mat\Delta_1, \mat\Delta_2$. Its density is given by
|
||||
\begin{displaymath}
|
||||
f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right).
|
||||
\end{displaymath}
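For illustration, a small R sketch that samples from this distribution through Cholesky factors of the two covariance matrices (a sketch based on the density above, not a reference implementation):
\begin{verbatim}
# Draw one p x q matrix X ~ MN(mu, Delta_2, Delta_1) such that
# vec(X) ~ N(vec(mu), Delta_1 %x% Delta_2).
rmatnorm <- function(mu, Delta.2, Delta.1) {
    p <- nrow(mu); q <- ncol(mu)
    Z <- matrix(rnorm(p * q), p, q)    # i.i.d. standard normal entries
    # L Z U with L L' = Delta_2 and U' U = Delta_1
    mu + t(chol(Delta.2)) %*% Z %*% chol(Delta.1)
}
X <- rmatnorm(matrix(0, 3, 2), diag(3), 0.5^abs(outer(1:2, 1:2, `-`)))
\end{verbatim}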
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%% Reference Summaries %%%
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
\section{Reference Summaries}
|
||||
This section contains short summaries of the main references with each sub-section concerning one paper.
|
||||
|
||||
\subsection{}
|
||||
|
||||
\subsection{Generalized Tensor Decomposition With Features on Multiple Modes}
|
||||
The \cite{TensorDecomp-HuLeeWang2022} paper proposes a multi-linear conditional mean model for a constrained rank tensor decomposition. Let the response $\ten{Y}\in\mathbb{R}^{d_1\times ... \times d_K}$ be an order $K$ tensor. Associated with each mode $k\in[K]$ they assume feature matrices $\mat{X}_k\in\mathbb{R}^{d_k\times p_k}$. Now, they assume that, conditional on the feature matrices $\mat{X}_k$, the entries of the tensor $\ten{Y}$ are independent realizations. The rank constraint is specified through $\mat{r} = (r_1, ..., r_K)$; the model is then given by
|
||||
\begin{displaymath}
|
||||
\E(\ten{Y} | \mat{X}_1, ..., \mat{X}_K) = f(\ten{C}\times\{ \mat{X}_1\mat{M}_1, ..., \mat{X}_K\mat{M}_K \}),\qquad \t{\mat{M}_k}\mat{M}_k = \mat{I}_{r_k}\ \forall k\in[K].
|
||||
\end{displaymath}
|
||||
The order $K$ tensor $\ten{C}\in\mathbb{R}^{r_1\times...\times r_K}$ is an unknown full-rank core tensor and the matrices $\mat{M}_k\in\mathbb{R}^{p_k\times r_k}$ are unknown factor matrices. The function $f$ is applied element-wise and serves as the link function based on the assumed distribution family of the tensor entries. Finally, the operation $\times$ denotes the tensor-by-matrix product using the shorthand
|
||||
\begin{displaymath}
|
||||
\ten{C}\times\{ \mat{X}_1\mat{M}_1, ..., \mat{X}_K\mat{M}_K \}
|
||||
= \ten{C}\ttm[1]\mat{X}_1\mat{M}_1\ ...\ttm[K]\mat{X}_K\mat{M}_K
|
||||
\end{displaymath}
|
||||
with $\ttm[k]$ denoting the $k$-mode tensor matrix product.
|
||||
|
||||
The algorithm for estimation of $\ten{C}$ and $\mat{M}_1, ..., \mat{M}_K$ assumes the individual conditional entries of $\ten{Y}$ to be independent and to follow a generalized linear model with link function $f$. The proposed algorithm is an iterative algorithm for minimizing the negative log-likelihood
|
||||
\begin{displaymath}
|
||||
l(\ten{C}, \mat{M}_1, ..., \mat{M}_K) = \langle \ten{Y}, \Theta \rangle - \sum_{i_1, ..., i_K} b(\Theta_{i_1, ..., i_K}), \qquad \Theta = \ten{C}\times\{ \mat{X}_1\mat{M}_1, ..., \mat{X}_K\mat{M}_K \}
|
||||
\end{displaymath}
|
||||
where $b = f'$ is the derivative of the canonical link function $f$ in the generalized linear model that the conditional entries of $\ten{Y}$ follow. The algorithm utilizes the higher-order SVD (HOSVD) to enforce the rank constraint.
|
||||
|
||||
The main benefit is that this approach generalizes well to a multitude of different structured data sets.
|
||||
|
||||
\todo{ how does this relate to the $\mat{X} = \mat{\mu} + \mat{\beta}\mat{f}_y\t{\mat{\alpha}} + \mat{\epsilon}$ model.}
|
||||
|
||||
\end{document}
|
|
@@ -0,0 +1,985 @@
|
|||
library(tensorPredictors)
|
||||
library(dplyr)
|
||||
library(ggplot2)
|
||||
|
||||
### Exec all methods for a given data set and collect logs ###
|
||||
sim <- function(X, Fy, shape, alpha.true, beta.true, max.iter = 500L) {
|
||||
|
||||
# Logger creator
|
||||
logger <- function(name) {
|
||||
eval(substitute(function(iter, loss, alpha, beta, ...) {
|
||||
hist[iter + 1L, ] <<- c(
|
||||
iter = iter,
|
||||
loss = loss,
|
||||
dist = (dist <- dist.subspace(c(kronecker(alpha.true, beta.true)),
|
||||
c(kronecker(alpha, beta)))),
|
||||
dist.alpha = (dist.alpha <- dist.subspace(c(alpha.true), c(alpha))),
|
||||
dist.beta = (dist.beta <- dist.subspace(c( beta.true), c(beta ))),
|
||||
norm.alpha = norm(alpha, "F"),
|
||||
norm.beta = norm(beta, "F")
|
||||
)
|
||||
|
||||
cat(sprintf(
|
||||
"%3d | l = %-12.4f - dist = %-.4e - alpha(%d, %d) = %-.4e - beta(%d, %d) = %-.4e\n",
|
||||
iter, loss,
|
||||
dist,
|
||||
nrow(alpha), ncol(alpha), dist.alpha,
|
||||
nrow(beta), ncol(beta), dist.beta
|
||||
))
|
||||
}, list(hist = as.symbol(paste0("hist.", name)))))
|
||||
}
|
||||
|
||||
# Initialize logger history targets
|
||||
hist.base <- hist.new <- hist.momentum <- # hist.kron <-
|
||||
data.frame(iter = seq(0L, max.iter),
|
||||
loss = NA, dist = NA, dist.alpha = NA, dist.beta = NA,
|
||||
norm.alpha = NA, norm.beta = NA
|
||||
)
|
||||
hist.kron <- NULL # TODO: fit kron version
|
||||
|
||||
# Base (old)
|
||||
kpir.base(X, Fy, shape, max.iter = max.iter, logger = logger("base"))
|
||||
|
||||
# New (simple Gradient Descent)
|
||||
kpir.new(X, Fy, shape, max.iter = max.iter, logger = logger("new"))
|
||||
|
||||
# Gradient Descent with Nesterov Momentum
|
||||
kpir.momentum(X, Fy, shape, max.iter = max.iter, logger = logger("momentum"))
|
||||
|
||||
# # Residual Covariance Kronecker product assumption version
|
||||
# kpir.kron(X, Fy, shape, max.iter = max.iter, logger = logger("kron"))
|
||||
|
||||
# Add method tags
|
||||
hist.base$type <- factor("base")
|
||||
hist.new$type <- factor("new")
|
||||
hist.momentum$type <- factor("momentum")
|
||||
# hist.kron$type <- factor("kron")
|
||||
|
||||
# Combine results and return
|
||||
rbind(hist.base, hist.new, hist.momentum, hist.kron)
|
||||
}
|
||||
|
||||
## Generate some test data / DEBUG
|
||||
n <- 200 # Sample Size
|
||||
p <- sample(1:15, 1) # 11
|
||||
q <- sample(1:15, 1) # 3
|
||||
k <- sample(1:15, 1) # 7
|
||||
r <- sample(1:15, 1) # 5
|
||||
print(c(n, p, q, k, r))
|
||||
|
||||
hist <- NULL
|
||||
for (rep in 1:20) {
|
||||
alpha.true <- alpha <- matrix(rnorm(q * r), q, r)
|
||||
beta.true <- beta <- matrix(rnorm(p * k), p, k)
|
||||
y <- rnorm(n)
|
||||
Fy <- do.call(cbind, Map(function(slope, offset) {
|
||||
sin(slope * y + offset)
|
||||
},
|
||||
head(rep(seq(1, ceiling(0.5 * k * r)), each = 2), k * r),
|
||||
head(rep(c(0, pi / 2), ceiling(0.5 * k * r)), k * r)
|
||||
))
|
||||
Delta <- 0.5^abs(outer(seq_len(p * q), seq_len(p * q), `-`))
|
||||
X <- tcrossprod(Fy, kronecker(alpha, beta)) + CVarE:::rmvnorm(n, sigma = Delta)
|
||||
|
||||
hist.sim <- sim(X, Fy, shape = c(p, q, k, r), alpha.true, beta.true)
|
||||
hist.sim$repetition <- rep
|
||||
|
||||
hist <- rbind(hist, hist.sim)
|
||||
}
|
||||
|
||||
|
||||
saveRDS(hist, file = "AR.rds")
|
||||
hist$repetition <- factor(hist$repetition)
|
||||
|
||||
ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) +
|
||||
geom_line(aes(y = loss)) +
|
||||
geom_point(data = with(sub <- subset(hist, !is.na(loss)),
|
||||
aggregate(sub, list(type, repetition), tail, 1)
|
||||
), aes(y = loss)) +
|
||||
labs(
|
||||
title = bquote(paste("Optimization Objective: negative log-likelihood ",
|
||||
l(hat(alpha), hat(beta)))),
|
||||
subtitle = bquote(paste(Delta[i][j] == 0.25, " * ", 0.5^abs(i - j), ", ",
|
||||
"20 repetitions, ", n == .(n), ", ",
|
||||
p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))),
|
||||
x = "nr. of iterations",
|
||||
y = expression(l(hat(alpha), hat(beta))),
|
||||
color = "method"
|
||||
) +
|
||||
theme(legend.position = "bottom")
|
||||
|
||||
dev.print(png, file = "sim01_loss.png", width = 768, height = 768, res = 125)
|
||||
|
||||
|
||||
ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) +
|
||||
geom_line(aes(y = dist)) +
|
||||
geom_point(data = with(sub <- subset(hist, !is.na(dist)),
|
||||
aggregate(sub, list(type, repetition), tail, 1)
|
||||
), aes(y = dist)) +
|
||||
labs(
|
||||
title = bquote(paste("Distance of estimate ", hat(B), " to true ", B == alpha %*% beta)),
|
||||
subtitle = bquote(paste(Delta[i][j] == 0.25, " * ", 0.5^abs(i - j), ", ",
|
||||
"20 repetitions, ", n == .(n), ", ",
|
||||
p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))),
|
||||
x = "nr. of iterations",
|
||||
y = expression(abs(B * B^T - hat(B) * hat(B)^T)),
|
||||
color = "method"
|
||||
) +
|
||||
theme(legend.position = "bottom")
|
||||
|
||||
dev.print(png, file = "sim01_dist.png", width = 768, height = 768, res = 125)
|
||||
|
||||
|
||||
ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) +
|
||||
geom_line(aes(y = dist.alpha)) +
|
||||
geom_point(data = with(sub <- subset(hist, !is.na(dist.alpha)),
|
||||
aggregate(sub, list(type, repetition), tail, 1)
|
||||
), aes(y = dist.alpha)) +
|
||||
labs(
|
||||
title = bquote(paste("Distance of estimate ", hat(alpha), " to true ", alpha)),
|
||||
subtitle = bquote(paste(Delta[i][j] == 0.25, " * ", 0.5^abs(i - j), ", ",
|
||||
"20 repetitions, ", n == .(n), ", ",
|
||||
p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))),
|
||||
x = "nr. of iterations",
|
||||
y = expression(abs(alpha * alpha^T - hat(alpha) * hat(alpha)^T)),
|
||||
color = "method"
|
||||
) +
|
||||
theme(legend.position = "bottom")
|
||||
|
||||
dev.print(png, file = "sim01_dist_alpha.png", width = 768, height = 768, res = 125)
|
||||
|
||||
|
||||
ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) +
|
||||
geom_line(aes(y = dist.beta)) +
|
||||
geom_point(data = with(sub <- subset(hist, !is.na(dist.beta)),
|
||||
aggregate(sub, list(type, repetition), tail, 1)
|
||||
), aes(y = dist.beta)) +
|
||||
labs(
|
||||
title = bquote(paste("Distance of estimate ", hat(beta), " to true ", beta)),
|
||||
subtitle = bquote(paste(Delta[i][j] == 0.25, " * ", 0.5^abs(i - j), ", ",
|
||||
"20 repetitions, ", n == .(n), ", ",
|
||||
p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))),
|
||||
x = "nr. of iterations",
|
||||
y = expression(abs(beta * beta^T - hat(beta) * hat(beta)^T)),
|
||||
color = "method"
|
||||
) +
|
||||
theme(legend.position = "bottom")
|
||||
|
||||
dev.print(png, file = "sim01_dist_beta.png", width = 768, height = 768, res = 125)
|
||||
|
||||
|
||||
ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) +
|
||||
geom_line(aes(y = norm.alpha)) +
|
||||
geom_point(data = with(sub <- subset(hist, !is.na(norm.alpha)),
|
||||
aggregate(sub, list(type, repetition), tail, 1)
|
||||
), aes(y = norm.alpha)) +
|
||||
labs(
|
||||
title = expression(paste("Norm of ", hat(alpha))),
|
||||
subtitle = bquote(paste(Delta[i][j] == 0.25, " * ", 0.5^abs(i - j), ", ",
|
||||
"20 repetitions, ", n == .(n), ", ",
|
||||
p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))),
|
||||
x = "nr. of iterations",
|
||||
y = expression(abs(hat(alpha))[F]),
|
||||
color = "method"
|
||||
) +
|
||||
theme(legend.position = "bottom")
|
||||
|
||||
dev.print(png, file = "sim01_norm_alpha.png", width = 768, height = 768, res = 125)
|
||||
|
||||
ggplot(hist, aes(x = iter, color = type, group = interaction(type, repetition))) +
|
||||
geom_line(aes(y = norm.beta)) +
|
||||
geom_point(data = with(sub <- subset(hist, !is.na(norm.beta)),
|
||||
aggregate(sub, list(type, repetition), tail, 1)
|
||||
), aes(y = norm.beta)) +
|
||||
labs(
|
||||
title = expression(paste("Norm of ", hat(beta))),
|
||||
subtitle = bquote(paste(Delta[i][j] == 0.25, " * ", 0.5^abs(i - j), ", ",
|
||||
"20 repetitions, ", n == .(n), ", ",
|
||||
p == .(p), ", ", q == .(q), ", ", k == .(k), ", ", r == .(r))),
|
||||
x = "nr. of iterations",
|
||||
y = expression(abs(hat(beta))[F]),
|
||||
color = "method"
|
||||
) +
|
||||
theme(legend.position = "bottom")
|
||||
|
||||
dev.print(png, file = "sim01_norm_beta.png", width = 768, height = 768, res = 125)
|
||||
|
||||
|
||||
|
||||
# local({
|
||||
# par(mfrow = c(2, 3), oma = c(2, 1, 1, 1), mar = c(3.1, 2.1, 2.1, 1.1), lwd = 2)
|
||||
# plot(c(1, max.iter), range(c(hist.base$loss, hist.new$loss, hist.momentum$loss, hist.kron$loss), na.rm = TRUE),
|
||||
# type = "n", log = "x", main = "loss")
|
||||
# lines( hist.base$loss, col = 2)
|
||||
# lines( hist.new$loss, col = 3)
|
||||
# lines(hist.momentum$loss, col = 4)
|
||||
# lines( hist.kron$loss, col = 5)
|
||||
# yrange <- range(c(hist.base$step.size, hist.new$step.size, hist.momentum$step.size, hist.kron$step.size),
|
||||
# na.rm = TRUE)
|
||||
# plot(c(1, max.iter), yrange,
|
||||
# type = "n", log = "x", main = "step.size")
|
||||
# lines( hist.base$step.size, col = 2)
|
||||
# lines( hist.new$step.size, col = 3)
|
||||
# lines(hist.momentum$step.size, col = 4)
|
||||
# lines( hist.kron$step.size, col = 5)
|
||||
# # lines( hist.base$step.size, col = 4) # there is no step.size
|
||||
# plot(0, 0, type = "l", bty = "n", xaxt = "n", yaxt = "n")
|
||||
# legend("topleft", legend = c("Base", "GD", "GD + Momentum", "Kron + GD + Momentum"), col = 2:5,
|
||||
# lwd = par("lwd"), xpd = TRUE, horiz = FALSE, cex = 1.2, bty = "n",
|
||||
# x.intersp = 1, y.intersp = 1.5)
|
||||
# # xpd = TRUE makes the legend plot to the figure
|
||||
# plot(c(1, max.iter), range(c(hist.base$dist, hist.new$dist, hist.momentum$dist, hist.kron$dist), na.rm = TRUE),
|
||||
# type = "n", log = "x", main = "dist")
|
||||
# lines( hist.base$dist, col = 2)
|
||||
# lines( hist.new$dist, col = 3)
|
||||
# lines(hist.momentum$dist, col = 4)
|
||||
# lines( hist.kron$dist, col = 5)
|
||||
# plot(c(1, max.iter), range(c(hist.base$dist.alpha, hist.new$dist.alpha, hist.momentum$dist.alpha, hist.kron$dist.alpha), na.rm = TRUE),
|
||||
# type = "n", log = "x", main = "dist.alpha")
|
||||
# lines( hist.base$dist.alpha, col = 2)
|
||||
# lines( hist.new$dist.alpha, col = 3)
|
||||
# lines(hist.momentum$dist.alpha, col = 4)
|
||||
# lines( hist.kron$dist.alpha, col = 5)
|
||||
# plot(c(1, max.iter), range(c(hist.base$dist.beta, hist.new$dist.beta, hist.momentum$dist.beta, hist.kron$dist.beta), na.rm = TRUE),
|
||||
# type = "n", log = "x", main = "dist.beta")
|
||||
# lines( hist.base$dist.beta, col = 2)
|
||||
# lines( hist.new$dist.beta, col = 3)
|
||||
# lines(hist.momentum$dist.beta, col = 4)
|
||||
# lines( hist.kron$dist.beta, col = 5)
|
||||
# # par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE)
|
||||
# # plot(0, 0, type = 'l', bty = 'n', xaxt = 'n', yaxt = 'n')
|
||||
# # legend('bottom', legend = c('GD', 'GD + Nesterov Momentum', 'Alternating'), col = 2:4,
|
||||
# # lwd = 5, xpd = TRUE, horiz = TRUE, cex = 1, seg.len = 1, bty = 'n')
|
||||
# # # xpd = TRUE makes the legend plot to the figure
|
||||
# })
|
||||
# dev.print(png, file = "loss.png", width = 768, height = 768, res = 125)
|
||||
|
||||
# with(list(a1 = alpha, a2 = fit.base$alpha, a3 = fit.new$alpha, a4 = fit.momentum$alpha,
|
||||
# b1 = beta, b2 = fit.base$beta, b3 = fit.new$beta, b4 = fit.momentum$beta), {
|
||||
# par(mfrow = c(2, 4))
|
||||
# a2 <- sign(sum(sign(a1 * a2))) * a2
|
||||
# a3 <- sign(sum(sign(a1 * a3))) * a3
|
||||
# a4 <- sign(sum(sign(a1 * a4))) * a4
|
||||
# b2 <- sign(sum(sign(b1 * b2))) * b2
|
||||
# b3 <- sign(sum(sign(b1 * b3))) * b3
|
||||
# b4 <- sign(sum(sign(b1 * b4))) * b4
|
||||
|
||||
# matrixImage(a1, main = expression(alpha))
|
||||
# matrixImage(a2, main = expression(paste(hat(alpha)["Base"])))
|
||||
# matrixImage(a3, main = expression(paste(hat(alpha)["GD"])))
|
||||
# matrixImage(a4, main = expression(paste(hat(alpha)["GD+Nest"])))
|
||||
# matrixImage(b1, main = expression(beta))
|
||||
# matrixImage(b2, main = expression(paste(hat(beta)["Base"])))
|
||||
# matrixImage(b3, main = expression(paste(hat(beta)["GD"])))
|
||||
# matrixImage(b4, main = expression(paste(hat(beta)["GD+Nest"])))
|
||||
# })
|
||||
# dev.print(png, file = "estimates.png", width = 768, height = 768, res = 125)
|
||||
|
||||
|
||||
# with(list(d1 = Delta, d2 = fit.base$Delta, d3 = fit.new$Delta, d4 = fit.momentum$Delta), {
|
||||
# par(mfrow = c(2, 2))
|
||||
|
||||
# matrixImage(d1, main = expression(Delta))
|
||||
# matrixImage(d2, main = expression(hat(Delta)["Base"]))
|
||||
# matrixImage(d3, main = expression(hat(Delta)["GD"]))
|
||||
# matrixImage(d4, main = expression(hat(Delta)["GD+Nest"]))
|
||||
# })
|
||||
# dev.print(png, file = "Delta.png", width = 768, height = 768, res = 125)
|
||||
|
||||
# with(list(a1 = alpha, a2 = fit.base$alpha, a3 = fit.new$alpha, a4 = fit.momentum$alpha,
|
||||
# b1 = beta, b2 = fit.base$beta, b3 = fit.new$beta, b4 = fit.momentum$beta), {
|
||||
# par(mfrow = c(2, 2))
|
||||
|
||||
# matrixImage(kronecker(a1, b1), main = expression(B))
|
||||
# matrixImage(kronecker(a2, b2), main = expression(hat(B)["Base"]))
|
||||
# matrixImage(kronecker(a3, b3), main = expression(hat(B)["GD"]))
|
||||
# matrixImage(kronecker(a4, b4), main = expression(hat(B)["GD+Nest"]))
|
||||
# })
|
||||
# dev.print(png, file = "B.png", width = 768, height = 768, res = 125)
|
||||
|
||||
# with(list(a1 = alpha, a2 = fit.base$alpha, a3 = fit.new$alpha, a4 = fit.momentum$alpha,
|
||||
# b1 = beta, b2 = fit.base$beta, b3 = fit.new$beta, b4 = fit.momentum$beta), {
|
||||
# par(mfrow = c(3, 1), lwd = 1)
|
||||
|
||||
# d2 <- kronecker(a1, b1) - kronecker(a2, b2)
|
||||
# d3 <- kronecker(a1, b1) - kronecker(a3, b3)
|
||||
# d4 <- kronecker(a1, b1) - kronecker(a4, b4)
|
||||
# xlim <- c(-1, 1) * max(abs(c(d2, d3, d4)))
|
||||
# breaks <- seq(xlim[1], xlim[2], len = 41)
|
||||
|
||||
# hist(d2, main = expression(paste(base, (B - hat(B))[i])),
|
||||
# breaks = breaks, xlim = xlim, freq = FALSE)
|
||||
# lines(density(d2), col = 2)
|
||||
# abline(v = range(d2), lty = 2)
|
||||
# hist(d3, main = expression(paste(GD, (B - hat(B))[i])),
|
||||
# breaks = breaks, xlim = xlim, freq = FALSE)
|
||||
# lines(density(d3), col = 3)
|
||||
# abline(v = range(d3), lty = 2)
|
||||
# hist(d4, main = expression(paste(GD + Nest, (B - hat(B))[i])),
|
||||
# breaks = breaks, xlim = xlim, freq = FALSE)
|
||||
# lines(density(d4), col = 4)
|
||||
# abline(v = range(d4), lty = 2)
|
||||
# })
|
||||
# dev.print(png, file = "hist.png", width = 768, height = 768, res = 125)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# options(width = 300)
|
||||
# print(pr <- prof.tree::prof.tree("./Rprof.out"), limit = NULL
|
||||
# , pruneFun = function(x) x$percent > 0.01)
|
||||
|
||||
# par(mfrow = c(2, 2))
|
||||
# matrixImage(alpha, main = "alpha")
|
||||
# matrixImage(fit$alpha, main = "fit$alpha")
|
||||
# matrixImage(beta, main = "beta")
|
||||
# matrixImage(fit$beta, main = "fit$beta")
|
||||
|
||||
# if (diff(dim(alpha) * dim(beta)) > 0) {
|
||||
# par(mfrow = c(2, 1))
|
||||
# } else {
|
||||
# par(mfrow = c(1, 2))
|
||||
# }
|
||||
# matrixImage(kronecker(alpha, beta), main = "kronecker(alpha, beta)")
|
||||
# matrixImage(kronecker(fit$alpha, fit$beta), main = "kronecker(fit$alpha, fit$beta)")
|
||||
|
||||
# matrixImage(Delta, main = "Delta")
|
||||
# matrixImage(fit$Delta, main = "fit$Delta")
|
||||
|
||||
|
||||
# local({
|
||||
# a <- (-1 * (sum(sign(fit$alpha) * sign(alpha)) < 0)) * fit$alpha / mean(fit$alpha^2)
|
||||
# b <- alpha / mean(alpha^2)
|
||||
|
||||
# norm(a - b, "F")
|
||||
# })
|
||||
# local({
|
||||
# a <- (-1 * (sum(sign(fit$beta) * sign(beta)) < 0)) * fit$beta / mean(fit$beta^2)
|
||||
# b <- beta / mean(beta^2)
|
||||
|
||||
# norm(a - b, "F")
|
||||
# })
|
||||
|
||||
|
||||
# # Which Sequence?
|
||||
# x <- y <- 1
|
||||
# replicate(40, x <<- (y <<- x + y) - x)
|
||||
|
||||
|
||||
# # Face-Splitting Product
|
||||
# n <- 100
|
||||
# p <- 3
|
||||
# q <- 500
|
||||
# A <- matrix(rnorm(n * p), n)
|
||||
# B <- matrix(rnorm(n * q), n)
|
||||
|
||||
# faceSplit <- function(A, B) {
|
||||
# C <- vapply(seq_len(ncol(A)), function(i) A[, i] * B, B)
|
||||
# dim(C) <- c(nrow(A), ncol(A) * ncol(B))
|
||||
# C
|
||||
# }
|
||||
|
||||
# all.equal(
|
||||
# tkhatriRao(A, B),
|
||||
# faceSplit(A, B)
|
||||
# )
|
||||
# microbenchmark::microbenchmark(
|
||||
# tkhatriRao(A, B),
|
||||
# faceSplit(A, B)
|
||||
# )
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# dist.kron <- function(a0, b0, a1, b1) {
|
||||
# sqrt(sum(a0^2) * sum(b0^2) -
|
||||
# 2 * sum(diag(crossprod(a0, a1))) * sum(diag(crossprod(b0, b1))) +
|
||||
# sum(a1^2) * sum(b1^2))
|
||||
# }
|
||||
|
||||
|
||||
# alpha <- matrix(rnorm(q * r), q)
|
||||
# beta <- matrix(rnorm(p * k), p)
|
||||
# alpha.true <- matrix(rnorm(q * r), q)
|
||||
# beta.true <- matrix(rnorm(p * k), p)
|
||||
# all.equal(
|
||||
# dist.kron(alpha, beta, alpha.true, beta.true),
|
||||
# norm(kronecker(alpha, beta) - kronecker(alpha.true, beta.true), "F")
|
||||
# )
|
||||
|
||||
# A <- matrix(rnorm(p^2), p)
|
||||
# B <- matrix(rnorm(p^2), p)
|
||||
|
||||
# tr <- function(A) sum(diag(A))
|
||||
# tr(crossprod(A, B))
|
||||
# tr(tcrossprod(B, A))
|
||||
|
||||
# tr(crossprod(A, A))
|
||||
# tr(tcrossprod(A, A))
|
||||
# sum(A^2)
|
||||
|
||||
# alpha <- matrix(rnorm(q * r), q)
|
||||
# beta <- matrix(rnorm(p * k), p)
|
||||
|
||||
# norm(kronecker(alpha, beta), "F")^2
|
||||
# norm(alpha, "F")^2 * norm(beta, "F")^2
|
||||
# tr(crossprod(kronecker(alpha, beta)))
|
||||
# tr(tcrossprod(kronecker(alpha, beta)))
|
||||
# tr(crossprod(kronecker(t(alpha), t(beta))))
|
||||
# tr(crossprod(alpha)) * tr(crossprod(beta))
|
||||
# tr(tcrossprod(alpha)) * tr(tcrossprod(beta))
|
||||
# tr(crossprod(alpha)) * tr(tcrossprod(beta))
|
||||
# sum(alpha^2) * sum(beta^2)
|
||||
|
||||
# alpha <- matrix(rnorm(q * r), q)
|
||||
# beta <- matrix(rnorm(p * k), p)
|
||||
# alpha.true <- matrix(rnorm(q * r), q)
|
||||
# beta.true <- matrix(rnorm(p * k), p)
|
||||
# microbenchmark::microbenchmark(
|
||||
# norm(kronecker(alpha, beta), "F")^2,
|
||||
# norm(alpha, "F")^2 * norm(beta, "F")^2,
|
||||
# tr(crossprod(kronecker(alpha, beta))),
|
||||
# tr(tcrossprod(kronecker(alpha, beta))),
|
||||
# tr(crossprod(kronecker(t(alpha), t(beta)))),
|
||||
# tr(crossprod(alpha)) * tr(crossprod(beta)),
|
||||
# tr(tcrossprod(alpha)) * tr(tcrossprod(beta)),
|
||||
# tr(crossprod(alpha)) * tr(tcrossprod(beta)),
|
||||
# sum(alpha^2) * sum(beta^2),
|
||||
|
||||
# setup = {
|
||||
# p <- sample(1:10, 1)
|
||||
# q <- sample(1:10, 1)
|
||||
# k <- sample(1:10, 1)
|
||||
# r <- sample(1:10, 1)
|
||||
# assign("alpha", matrix(rnorm(q * r), q), .GlobalEnv)
|
||||
# assign("beta", matrix(rnorm(p * k), p), .GlobalEnv)
|
||||
# assign("alpha.true", matrix(rnorm(q * r), q), .GlobalEnv)
|
||||
# assign("beta.true", matrix(rnorm(p * k), p), .GlobalEnv)
|
||||
# }
|
||||
# )
|
||||
|
||||
|
||||
|
||||
# p <- sample(1:15, 1) # 11
|
||||
# q <- sample(1:15, 1) # 3
|
||||
# k <- sample(1:15, 1) # 7
|
||||
# r <- sample(1:15, 1) # 5
|
||||
# A <- matrix(rnorm(q * r), q)
|
||||
# B <- matrix(rnorm(p * k), p)
|
||||
# a <- matrix(rnorm(q * r), q)
|
||||
# b <- matrix(rnorm(p * k), p)
|
||||
|
||||
# all.equal(
|
||||
# kronecker(A + a, B + b),
|
||||
# kronecker(A, B) + kronecker(A, b) + kronecker(a, B) + kronecker(a, b)
|
||||
# )
|
||||
|
||||
|
||||
# p <- 200L
|
||||
# n <- 100L
|
||||
# R <- matrix(rnorm(n * p), n)
|
||||
# A <- matrix(rnorm(p^2), p) # Base Matrix
|
||||
# B <- A + 0.01 * matrix(rnorm(p^2), p) # Distortion / Update of A
|
||||
# A.inv <- solve(A)
|
||||
|
||||
# microbenchmark::microbenchmark(
|
||||
# solve = R %*% solve(B),
|
||||
# neumann.raw = R %*% (A.inv + A.inv %*% (A - B) %*% A.inv + A.inv %*% (A - B) %*% A.inv %*% (A - B) %*% A.inv),
|
||||
# neumann.fun = {
|
||||
# AD <- A.inv %*% (A - B)
|
||||
# res <- A.inv + AD %*% A.inv
|
||||
# res <- A.inv + AD %*% res
|
||||
# R %*% res
|
||||
# }
|
||||
# )
|
||||
|
||||
# all.equal(
|
||||
# A.inv + A.inv %*% (A - B) %*% A.inv + A.inv %*% (A - B) %*% A.inv %*% (A - B) %*% A.inv,
|
||||
# {
|
||||
# DA <- (A - B) %*% A.inv
|
||||
# res <- A.inv + A.inv %*% DA
|
||||
# res <- A.inv + res %*% DA
|
||||
# res
|
||||
# }
|
||||
# )
|
||||
|
||||
# all.equal(
|
||||
# A.inv + A.inv %*% (A - B) %*% A.inv + A.inv %*% (A - B) %*% A.inv %*% (A - B) %*% A.inv,
|
||||
# {
|
||||
# AD <- A.inv %*% (A - B)
|
||||
# res <- A.inv + AD %*% A.inv
|
||||
# res <- A.inv + AD %*% res
|
||||
# res
|
||||
# }
|
||||
# )
|
||||
|
||||
# #####
|
||||
# sym <- function(A) A + t(A)
|
||||
# n <- 101
|
||||
# p <- 7
|
||||
# q <- 11
|
||||
# r <- 3
|
||||
# k <- 5
|
||||
# R <- array(rnorm(n * p * q), dim = c(n = n, p = p, q = q))
|
||||
# F <- array(rnorm(n * k * r), dim = c(n = n, k = k, r = r))
|
||||
# alpha <- array(rnorm(q * r), dim = c(q = q, r = r))
|
||||
# beta <- array(rnorm(p * k), dim = c(p = p, k = k))
|
||||
# Delta.1 <- sym(matrix(rnorm(q * q), q, q))
|
||||
# dim(Delta.1) <- c(q = q, q = q)
|
||||
# Delta.2 <- sym(matrix(rnorm(p * p), p, p))
|
||||
# dim(Delta.2) <- c(p = p, p = p)
|
||||
|
||||
# Delta <- kronecker(Delta.1, Delta.2)
|
||||
|
||||
# grad.alpha.1 <- local({
|
||||
# Di.1 <- solve(Delta.1)
|
||||
# Di.2 <- solve(Delta.2)
|
||||
# .R <- sapply(seq_len(n), function(i) tcrossprod(Di.1, R[i, , ]) %*% Di.2)
|
||||
# dim(.R) <- c(q, p, n)
|
||||
# .F <- sapply(seq_len(n), function(i) beta %*% F[i, , ])
|
||||
# dim(.F) <- c(p, r, n)
|
||||
|
||||
# .C <- sapply(seq_len(n), function(i) .R[i, , ] %*% .F[i, , ])
|
||||
# dim(.C) <- c(n, q, r)
|
||||
# colSums(.C)
|
||||
# })
|
||||
# grad.alpha.2 <- local({
|
||||
# # Delta.1^-1 R' Delta.2^-1
|
||||
# .R <- aperm(R, c(2, 1, 3))
|
||||
# dim(.R) <- c(q, n * p)
|
||||
# .R <- solve(Delta.1) %*% .R
|
||||
# dim(.R) <- c(q, n, p)
|
||||
# .R <- aperm(.R, c(3, 2, 1))
|
||||
# dim(.R) <- c(p, n * q)
|
||||
# .R <- solve(Delta.2) %*% .R
|
||||
# dim(.R) <- c(p, n, q)
|
||||
# .R <- aperm(.R, c(2, 3, 1)) # n x q x p
|
||||
|
||||
# # beta F
|
||||
# .F <- aperm(F, c(2, 1, 3))
|
||||
# dim(.F) <- c(k, n * r)
|
||||
# .F <- beta %*% .F
|
||||
# dim(.F) <- c(p, n, r)
|
||||
# .F <- aperm(.F, c(2, 1, 3)) # n x p x r
|
||||
|
||||
# # (Delta.1^-1 R' Delta.2^-1) (beta F)
|
||||
# .R <- aperm(.R, c(1, 3, 2))
|
||||
# dim(.R) <- c(n * p, q)
|
||||
# dim(.F) <- c(n * p, r)
|
||||
# crossprod(.R, .F)
|
||||
# })
|
||||
|
||||
# all.equal(
|
||||
# grad.alpha.1,
|
||||
# grad.alpha.2
|
||||
# )
|
||||
|
||||
# all.equal({
|
||||
# .R <- matrix(0, q, p)
|
||||
# Di.1 <- solve(Delta.1)
|
||||
# Di.2 <- solve(Delta.2)
|
||||
# for (i in 1:n) {
|
||||
# .R <- .R + Di.1 %*% t(R[i, , ]) %*% Di.2
|
||||
# }
|
||||
# .R
|
||||
# }, {
|
||||
# .R <- R
|
||||
# Di.1 <- solve(Delta.1)
|
||||
# Di.2 <- solve(Delta.2)
|
||||
# .R <- sapply(seq_len(n), function(i) tcrossprod(Di.1, R[i, , ]) %*% Di.2)
|
||||
# dim(.R) <- c(q, p, n)
|
||||
# .R <- aperm(.R, c(3, 1, 2))
|
||||
# colSums(.R)
|
||||
# })
|
||||
# all.equal({
|
||||
# Di.1 <- solve(Delta.1)
|
||||
# Di.2 <- solve(Delta.2)
|
||||
# .R <- sapply(seq_len(n), function(i) tcrossprod(Di.1, R[i, , ]) %*% Di.2)
|
||||
# dim(.R) <- c(q, p, n)
|
||||
# .R <- aperm(.R, c(3, 1, 2))
|
||||
# .R
|
||||
# }, {
|
||||
# .R <- R
|
||||
# dim(.R) <- c(n * p, q)
|
||||
# .R <- .R %*% solve(Delta.1)
|
||||
# dim(.R) <- c(n, p, q)
|
||||
# .R <- aperm(.R, c(1, 3, 2))
|
||||
# dim(.R) <- c(n * q, p)
|
||||
# .R <- .R %*% solve(Delta.2)
|
||||
# dim(.R) <- c(n, q, p)
|
||||
# .R
|
||||
# })
|
||||
# all.equal({
|
||||
# .F <- matrix(0, p, r)
|
||||
# for (i in 1:n) {
|
||||
# .F <- .F + beta %*% F[i, , ]
|
||||
# }
|
||||
# .F
|
||||
# }, {
|
||||
# .F <- apply(F, 1, function(Fi) beta %*% Fi)
|
||||
# dim(.F) <- c(p, r, n)
|
||||
# .F <- aperm(.F, c(3, 1, 2))
|
||||
# colSums(.F)
|
||||
# })
|
||||
# all.equal({
|
||||
# .F <- apply(F, 1, function(Fi) beta %*% Fi)
|
||||
# dim(.F) <- c(p, r, n)
|
||||
# .F <- aperm(.F, c(3, 1, 2))
|
||||
# colSums(.F)
|
||||
# }, {
|
||||
# .F <- aperm(F, c(1, 3, 2))
|
||||
# dim(.F) <- c(n * r, k)
|
||||
# .F <- tcrossprod(.F, beta)
|
||||
# dim(.F) <- c(n, r, p)
|
||||
# t(colSums(.F))
|
||||
# })
|
||||
|
||||
# all.equal({
|
||||
# Di.1 <- solve(Delta.1)
|
||||
# Di.2 <- solve(Delta.2)
|
||||
# grad.alpha <- 0
|
||||
# grad.beta <- 0
|
||||
# dim(R) <- c(n, p, q)
|
||||
# dim(F) <- c(n, k, r)
|
||||
# for (i in 1:n) {
|
||||
# grad.alpha <- grad.alpha + (
|
||||
# Di.1 %*% t(R[i, , ]) %*% Di.2 %*% beta %*% F[i, , ]
|
||||
# )
|
||||
# grad.beta <- grad.beta + (
|
||||
# Di.2 %*% R[i, , ] %*% Di.1 %*% alpha %*% t(F[i, , ])
|
||||
# )
|
||||
# }
|
||||
|
||||
# g1 <- c(dim(grad.alpha), dim(grad.beta), grad.alpha, grad.beta)
|
||||
# }, {
|
||||
# # Note that the order is important since for grad.beta the residuals do NOT
|
||||
# # need to be transposes.
|
||||
|
||||
# # left/right standardized residuals Delta_1^-1 R_i' Delta_2^-1 for i in 1:n
|
||||
# dim(R) <- c(n * p, q)
|
||||
# .R <- R %*% solve(Delta.1)
|
||||
# dim(.R) <- c(n, p, q)
|
||||
# .R <- aperm(.R, c(1, 3, 2))
|
||||
# dim(.R) <- c(n * q, p)
|
||||
# .R <- .R %*% solve(Delta.2)
|
||||
# dim(.R) <- c(n, q, p)
|
||||
|
||||
# # gradient with respect to beta
|
||||
# # Responces times beta (alpha f_i')
|
||||
# dim(F) <- c(n * k, r)
|
||||
# .F <- tcrossprod(F, alpha)
|
||||
# dim(.F) <- c(n, k, q)
|
||||
# .F <- aperm(.F, c(1, 3, 2))
|
||||
# # Matricize
|
||||
# dim(.R) <- c(n * q, p)
|
||||
# dim(.F) <- c(n * q, k)
|
||||
# grad.beta <- crossprod(.R, .F)
|
||||
|
||||
# # gradient with respect to beta
|
||||
# # Responces times alpha
|
||||
# dim(F) <- c(n, k, r)
|
||||
# .F <- aperm(F, c(1, 3, 2))
|
||||
# dim(.F) <- c(n * r, k)
|
||||
# .F <- tcrossprod(.F, beta)
|
||||
# dim(.F) <- c(n, r, p)
|
||||
# .F <- aperm(.F, c(1, 3, 2))
|
||||
# # Transpose stand. residuals
|
||||
# dim(.R) <- c(n, q, p)
|
||||
# .R <- aperm(.R, c(1, 3, 2))
|
||||
# # Matricize
|
||||
# dim(.R) <- c(n * p, q)
|
||||
# dim(.F) <- c(n * p, r)
|
||||
# grad.alpha <- crossprod(.R, .F)
|
||||
|
||||
# g2 <- c(dim(grad.alpha), dim(grad.beta), grad.alpha, grad.beta)
|
||||
# })
|
||||
|
||||
|
||||
# microbenchmark::microbenchmark(R1 = {
|
||||
# Di.1 <- solve(Delta.1)
|
||||
# Di.2 <- solve(Delta.2)
|
||||
# grad.alpha <- 0 # matrix(0, q, r)
|
||||
# grad.beta <- 0 # matrix(0, p, k)
|
||||
# dim(R) <- c(n, p, q)
|
||||
# dim(F) <- c(n, k, r)
|
||||
# for (i in 1:n) {
|
||||
# grad.alpha <- grad.alpha + (
|
||||
# Di.1 %*% t(R[i, , ]) %*% Di.2 %*% beta %*% F[i, , ]
|
||||
# )
|
||||
# grad.beta <- grad.beta + (
|
||||
# Di.2 %*% R[i, , ] %*% Di.1 %*% alpha %*% t(F[i, , ])
|
||||
# )
|
||||
# }
|
||||
|
||||
# g1 <- c(dim(grad.alpha), dim(grad.beta), grad.alpha, grad.beta)
|
||||
# }, R3 = {
|
||||
# # Note that the order is important since for grad.beta the residuals do NOT
|
||||
# # need to be transposes.
|
||||
|
||||
# # left/right standardized residuals Delta_1^-1 R_i' Delta_2^-1 for i in 1:n
|
||||
# dim(R) <- c(n * p, q)
|
||||
# .R <- R %*% solve(Delta.1)
|
||||
# dim(.R) <- c(n, p, q)
|
||||
# .R <- aperm(.R, c(1, 3, 2))
|
||||
# dim(.R) <- c(n * q, p)
|
||||
# .R <- .R %*% solve(Delta.2)
|
||||
# dim(.R) <- c(n, q, p)
|
||||
|
||||
# # gradient with respect to beta
|
||||
# # Responces times beta (alpha f_i')
|
||||
# dim(F) <- c(n * k, r)
|
||||
# .F <- tcrossprod(F, alpha)
|
||||
# dim(.F) <- c(n, k, q)
|
||||
# .F <- aperm(.F, c(1, 3, 2))
|
||||
# # Matricize
|
||||
# dim(.R) <- c(n * q, p)
|
||||
# dim(.F) <- c(n * q, k)
|
||||
# grad.beta <- crossprod(.R, .F)
|
||||
|
||||
# # gradient with respect to beta
|
||||
# # Responces times alpha
|
||||
# dim(F) <- c(n, k, r)
|
||||
# .F <- aperm(F, c(1, 3, 2))
|
||||
# dim(.F) <- c(n * r, k)
|
||||
# .F <- tcrossprod(.F, beta)
|
||||
# dim(.F) <- c(n, r, p)
|
||||
# .F <- aperm(.F, c(1, 3, 2))
|
||||
# # Transpose stand. residuals
|
||||
# dim(.R) <- c(n, q, p)
|
||||
# .R <- aperm(.R, c(1, 3, 2))
|
||||
# # Matricize
|
||||
# dim(.R) <- c(n * p, q)
|
||||
# dim(.F) <- c(n * p, r)
|
||||
# grad.alpha <- crossprod(.R, .F)
|
||||
|
||||
# g2 <- c(dim(grad.alpha), dim(grad.beta), grad.alpha, grad.beta)
|
||||
# })
|
||||
|
||||
|
||||
# n <- 100
|
||||
# p <- 7
|
||||
# q <- 11
|
||||
# k <- 3
|
||||
# r <- 5
|
||||
|
||||
# X <- array(rnorm(n * p * q), dim = c(n = n, p = p, q = q))
|
||||
# F <- array(rnorm(n * k * r), dim = c(n = n, k = k, r = r))
|
||||
# alpha <- array(rnorm(q * r), dim = c(q = q, r = r))
|
||||
# beta <- array(rnorm(p * k), dim = c(p = p, k = k))
|
||||
|
||||
# all.equal({
|
||||
# R <- array(NA, dim = c(n, p, q))
|
||||
# for (i in 1:n) {
|
||||
# R[i, , ] <- X[i, , ] - beta %*% F[i, , ] %*% t(alpha)
|
||||
# }
|
||||
# R
|
||||
# }, {
|
||||
# X - (F %x_3% alpha %x_2% beta)
|
||||
# }, check.attributes = FALSE)
|
||||
|
||||
# microbenchmark::microbenchmark(base = {
|
||||
# R <- array(NA, dim = c(n, p, q))
|
||||
# for (i in 1:n) {
|
||||
# R[i, , ] <- X[i, , ] - beta %*% F[i, , ] %*% t(alpha)
|
||||
# }
|
||||
# R
|
||||
# }, ttm = {
|
||||
# X - (F %x_3% alpha %x_2% beta)
|
||||
# })
|
||||
|
||||
|
||||
|
||||
# n <- 100; p <- 7; q <- 11; k <- 3; r <- 5
|
||||
# sym <- function(x) t(x) + x
|
||||
# Di.1 <- sym(matrix(rnorm(q^2), q, q))
|
||||
# Di.2 <- sym(matrix(rnorm(p^2), p, p))
|
||||
# R <- array(rnorm(n, p, q), dim = c(n, p, q))
|
||||
# F <- array(rnorm(n, k, r), dim = c(n, k, r))
|
||||
# alpha <- matrix(rnorm(q * r), q, r)
|
||||
# beta <- matrix(rnorm(p * k), p, k)
|
||||
|
||||
# all.equal({
|
||||
# .R <- array(NA, dim = c(n, p, q))
|
||||
# for (i in 1:n) {
|
||||
# .R[i, , ] <- Di.2 %*% R[i, , ] %*% Di.1
|
||||
# }
|
||||
# .R
|
||||
# }, {
|
||||
# R %x_3% Di.1 %x_2% Di.2
|
||||
# })
|
||||
# all.equal({
|
||||
# .Rt <- array(NA, dim = c(n, q, p))
|
||||
# for (i in 1:n) {
|
||||
# .Rt[i, , ] <- Di.1 %*% t(R[i, , ]) %*% Di.2
|
||||
# }
|
||||
# .Rt
|
||||
# }, {
|
||||
# .Rt <- R %x_3% Di.1 %x_2% Di.2
|
||||
# aperm(.Rt, c(1, 3, 2))
|
||||
# })
|
||||
# all.equal({
|
||||
# .Fa <- array(NA, dim = c(n, q, k))
|
||||
# for (i in 1:n) {
|
||||
# .Fa[i, , ] <- alpha %*% t(F[i, , ])
|
||||
# }
|
||||
# .Fa
|
||||
# }, {
|
||||
# aperm(F %x_3% alpha, c(1, 3, 2))
|
||||
# })
|
||||
# all.equal({
|
||||
# .Fb <- array(NA, dim = c(n, p, r))
|
||||
# for (i in 1:n) {
|
||||
# .Fb[i, , ] <- beta %*% F[i, , ]
|
||||
# }
|
||||
# .Fb
|
||||
# }, {
|
||||
# F %x_2% beta
|
||||
# })
|
||||
# all.equal({
|
||||
# .F <- array(NA, dim = c(n, p, q))
|
||||
# for (i in 1:n) {
|
||||
# .F[i, , ] <- beta %*% F[i, , ] %*% t(alpha)
|
||||
# }
|
||||
# .F
|
||||
# }, {
|
||||
# F %x_3% alpha %x_2% beta
|
||||
# })
|
||||
# all.equal({
|
||||
# .Ga <- 0
|
||||
# for (i in 1:n) {
|
||||
# .Ga <- .Ga + Di.1 %*% t(R[i, , ]) %*% Di.2 %*% beta %*% F[i, , ]
|
||||
# }
|
||||
# .Ga
|
||||
# }, {
|
||||
# .R <- R %x_3% Di.1 %x_2% Di.2
|
||||
# dim(.R) <- c(n * p, q)
|
||||
# .Fb <- F %x_2% beta
|
||||
# dim(.Fb) <- c(n * p, r)
|
||||
# crossprod(.R, .Fb)
|
||||
# })
|
||||
# all.equal({
|
||||
# .Gb <- 0
|
||||
# for (i in 1:n) {
|
||||
# .Gb <- .Gb + Di.2 %*% R[i, , ] %*% Di.1 %*% alpha %*% t(F[i, , ])
|
||||
# }
|
||||
# .Gb
|
||||
# }, {
|
||||
# .Rt <- aperm(R %x_3% Di.1 %x_2% Di.2, c(1, 3, 2))
|
||||
# dim(.Rt) <- c(n * q, p)
|
||||
# .Fa <- aperm(F %x_3% alpha, c(1, 3, 2))
|
||||
# dim(.Fa) <- c(n * q, k)
|
||||
# crossprod(.Rt, .Fa)
|
||||
# })
|
||||
# all.equal({
|
||||
# .Ga <- 0
|
||||
# for (i in 1:n) {
|
||||
# .Ga <- .Ga + Di.1 %*% t(R[i, , ]) %*% Di.2 %*% beta %*% F[i, , ]
|
||||
# }
|
||||
# .Gb <- 0
|
||||
# for (i in 1:n) {
|
||||
# .Gb <- .Gb + Di.2 %*% R[i, , ] %*% Di.1 %*% alpha %*% t(F[i, , ])
|
||||
# }
|
||||
# c(.Ga, .Gb)
|
||||
# }, {
|
||||
# .R <- R %x_3% Di.1 %x_2% Di.2
|
||||
# .Fb <- F %x_2% beta
|
||||
# .Fa <- aperm(F %x_3% alpha, c(1, 3, 2))
|
||||
|
||||
# dim(.R) <- c(n * p, q)
|
||||
# dim(.Fb) <- c(n * p, r)
|
||||
# .Ga <- crossprod(.R, .Fb)
|
||||
|
||||
# dim(.R) <- c(n, p, q)
|
||||
# .R <- aperm(.R, c(1, 3, 2))
|
||||
# dim(.R) <- c(n * q, p)
|
||||
# dim(.Fa) <- c(n * q, k)
|
||||
# .Gb <- crossprod(.R, .Fa)
|
||||
|
||||
# c(.Ga, .Gb)
|
||||
# })
|
||||
# all.equal({
|
||||
# .Ga <- 0
|
||||
# for (i in 1:n) {
|
||||
# .Ga <- .Ga + Di.1 %*% t(R[i, , ]) %*% Di.2 %*% beta %*% F[i, , ]
|
||||
# }
|
||||
# .Gb <- 0
|
||||
# for (i in 1:n) {
|
||||
# .Gb <- .Gb + Di.2 %*% R[i, , ] %*% Di.1 %*% alpha %*% t(F[i, , ])
|
||||
# }
|
||||
# c(.Ga, .Gb)
|
||||
# }, {
|
||||
# .R <- R %x_3% Di.1 %x_2% Di.2
|
||||
|
||||
# .Ga <- tcrossprod(mat(.R, 3), mat(F %x_2% beta, 3))
|
||||
# .Gb <- tcrossprod(mat(.R, 2), mat(F %x_3% alpha, 2))
|
||||
|
||||
# c(.Ga, .Gb)
|
||||
# })
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# n <- 101; p <- 5; q <- 7
|
||||
|
||||
# sym <- function(x) crossprod(x)
|
||||
# D1 <- sym(matrix(rnorm(q^2), q, q))
|
||||
# D2 <- sym(matrix(rnorm(p^2), p, p))
|
||||
|
||||
# X <- tensorPredictors:::rmvnorm(n, sigma = kronecker(D1, D2))
|
||||
# dim(X) <- c(n, p, q)
|
||||
|
||||
# D1.hat <- tcrossprod(mat(X, 3)) / n
|
||||
# D2.hat <- tcrossprod(mat(X, 2)) / n
|
||||
|
||||
# local({
|
||||
# par(mfrow = c(2, 2))
|
||||
# matrixImage(D1, main = "D1")
|
||||
# matrixImage(D1.hat, main = "D1.hat")
|
||||
# matrixImage(D2, main = "D2")
|
||||
# matrixImage(D2.hat, main = "D2.hat")
|
||||
# })
|
||||
|
||||
# sum(X^2) / n
|
||||
# sum(diag(D1.hat))
|
||||
# sum(diag(D2.hat))
|
||||
# sum(diag(kronecker(D1, D2)))
|
||||
# sum(diag(kronecker(D1.hat / sqrt(sum(diag(D1.hat))),
|
||||
# D2.hat / sqrt(sum(diag(D1.hat))))))
|
||||
|
||||
# all.equal({
|
||||
# mat(X, 1) %*% kronecker(D1.hat, D2.hat)
|
||||
# }, {
|
||||
# mat(X %x_3% D1.hat %x_2% D2.hat, 1)
|
||||
# })
|
||||
# all.equal({
|
||||
# C <- mat(X, 1) %*% kronecker(D1.hat, D2.hat) * (n / sum(X^2))
|
||||
# dim(C) <- c(n, p, q)
|
||||
# C
|
||||
# }, {
|
||||
# (X %x_3% D1.hat %x_2% D2.hat) / sum(diag(D1.hat))
|
||||
# })
|
||||
|
||||
|
||||
|
||||
|
||||
# D.1 <- tcrossprod(mat(X, 3))
|
||||
# D.2 <- tcrossprod(mat(X, 2))
|
||||
# tr <- sum(diag(D.1))
|
||||
# D.1 <- D.1 / sqrt(n * tr)
|
||||
# D.2 <- D.2 / sqrt(n * tr)
|
||||
|
||||
# sum(diag(kronecker(D1, D2)))
|
||||
# sum(diag(kronecker(D.1, D.2)))
|
||||
# det(kronecker(D1, D2))
|
||||
# det(kronecker(D.1, D.2))
|
||||
# det(D.1)^p * det(D.2)^q
|
||||
|
||||
# log(det(kronecker(D.1, D.2)))
|
||||
# p * log(det(D.1)) + q * log(det(D.2))
|
|
@ -1,16 +1,28 @@
|
|||
# Generated by roxygen2: do not edit by hand
|
||||
|
||||
export("%x_1%")
|
||||
export("%x_2%")
|
||||
export("%x_3%")
|
||||
export("%x_4%")
|
||||
export(CISE)
|
||||
export(LSIR)
|
||||
export(PCA2d)
|
||||
export(POI)
|
||||
export(RMReg)
|
||||
export(approx.kronecker)
|
||||
export(colKronecker)
|
||||
export(dist.projection)
|
||||
export(dist.subspace)
|
||||
export(kpir.base)
|
||||
export(kpir.kron)
|
||||
export(kpir.momentum)
|
||||
export(kpir.new)
|
||||
export(mat)
|
||||
export(matpow)
|
||||
export(matrixImage)
|
||||
export(reduce)
|
||||
export(rowKronecker)
|
||||
export(tensor_predictor)
|
||||
export(ttm)
|
||||
import(stats)
|
||||
useDynLib(tensorPredictors, .registration = TRUE)
|
||||
|
|
|
@ -135,7 +135,7 @@ RMReg <- function(X, Z, y, lambda = 0, max.iter = 500L, max.line.iter = 50L,
|
|||
}
|
||||
|
||||
# Evaluate loss to ensure descent after line search
|
||||
loss.temp <- loss(B.temp, beta.temp, X, Z, y)
|
||||
loss.temp <- left # loss(B.temp, beta.temp, X, Z, y) # already computed
|
||||
|
||||
# logging callback
|
||||
if (is.function(logger)) {
|
||||
|
@ -196,7 +196,7 @@ RMReg <- function(X, Z, y, lambda = 0, max.iter = 500L, max.line.iter = 50L,
|
|||
iter = iter,
|
||||
df = df,
|
||||
loss = loss1,
|
||||
lambda = lambda,
|
||||
lambda = delta * lambda, #
|
||||
AIC = loss1 + 2 * df, # TODO: check this!
|
||||
BIC = loss1 + log(nrow(X)) * df, # TODO: check this!
|
||||
call = match.call() # invoking function call, collects params like lambda
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
#' Column wise kronecker product
|
||||
#'
|
||||
#' This is a special case of the "Khatri-Rao Product".
|
||||
#'
|
||||
#' @seealso \code{\link{rowKronecker}}
|
||||
#'
|
||||
#' @export
|
||||
colKronecker <- function(A, B) {
|
||||
sapply(1:ncol(A), function(i) kronecker(A[, i], B[, i]))
|
||||
}
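# Scratch check (not part of the package, matrices `A`, `B` below are
# illustrative random examples): the column wise Kronecker product is the
# Khatri-Rao product, i.e. column i of colKronecker(A, B) is kronecker(A[, i], B[, i]).
# A <- matrix(rnorm(3 * 4), 3, 4)
# B <- matrix(rnorm(5 * 4), 5, 4)
# all.equal(c(colKronecker(A, B)[, 2]), c(kronecker(A[, 2], B[, 2])))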
|
||||
|
||||
#' Row wise kronecker product
|
||||
#'
|
||||
#' Also known as the "Face-splitting product". This is a special case of the
|
||||
#' "Khatri-Rao Product".
|
||||
#'
|
||||
#' @seealso \code{\link{colKronecker}}
|
||||
#'
|
||||
#' @export
|
||||
rowKronecker <- function(A, B) {
|
||||
C <- vapply(seq_len(ncol(A)), function(i) A[, i] * B, B)
|
||||
dim(C) <- c(nrow(A), ncol(A) * ncol(B))
|
||||
C
|
||||
}
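# Scratch check (not part of the package, `A`, `B` are illustrative): row i of
# rowKronecker(A, B) is kronecker(A[i, ], B[i, ]) (the Face-splitting product),
# hence colSums(rowKronecker(A, B)) sums the per-observation Kronecker products.
# This is exactly how the kpir.* gradients below accumulate
# sum_i kronecker(Fy[i, ], resid.trans[i, ]).
# A <- matrix(rnorm(6 * 2), 6, 2)
# B <- matrix(rnorm(6 * 3), 6, 3)
# all.equal(c(rowKronecker(A, B)[4, ]), c(kronecker(A[4, ], B[4, ])))
# all.equal(colSums(rowKronecker(A, B)),
#           c(Reduce(`+`, lapply(1:6, function(i) kronecker(A[i, ], B[i, ])))))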
|
|
@ -0,0 +1,136 @@
|
|||
#' Approximation of the log-likelihood using unbiased (but not MLE) estimates
|
||||
#' of the Kronecker decomposed covariance matrices Delta_1, Delta_2, which
|
||||
#' yields a closed form solution for the gradient.
|
||||
#'
|
||||
#' Delta_1 = n^-1 sum_i R_i' R_i,
|
||||
#' Delta_2 = n^-1 sum_i R_i R_i'.
|
||||
#'
|
||||
#' @export
|
||||
kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
|
||||
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
|
||||
nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
|
||||
eps = .Machine$double.eps,
|
||||
logger = NULL
|
||||
) {
|
||||
|
||||
# Check if X and Fy have same number of observations
|
||||
stopifnot(nrow(X) == NROW(Fy))
|
||||
n <- nrow(X) # Number of observations
|
||||
|
||||
# Get and check predictor dimensions
|
||||
if (length(dim(X)) == 2L) {
|
||||
stopifnot(!missing(shape))
|
||||
stopifnot(ncol(X) == prod(shape[1:2]))
|
||||
p <- as.integer(shape[1]) # Predictor "height"
|
||||
q <- as.integer(shape[2]) # Predictor "width"
|
||||
} else if (length(dim(X)) == 3L) {
|
||||
p <- dim(X)[2]
|
||||
q <- dim(X)[3]
|
||||
dim(X) <- c(n, p * q)
|
||||
} else {
|
||||
stop("'X' must be a matrix or 3-tensor")
|
||||
}
|
||||
|
||||
# Get and check response dimensions
|
||||
if (!is.array(Fy)) {
|
||||
Fy <- as.array(Fy)
|
||||
}
|
||||
if (length(dim(Fy)) == 1L) {
|
||||
k <- r <- 1L
|
||||
dim(Fy) <- c(n, 1L)
|
||||
} else if (length(dim(Fy)) == 2L) {
|
||||
stopifnot(!missing(shape))
|
||||
stopifnot(ncol(Fy) == prod(shape[3:4]))
|
||||
k <- as.integer(shape[3]) # Response functional "height"
|
||||
r <- as.integer(shape[4]) # Response functional "width"
|
||||
} else if (length(dim(Fy)) == 3L) {
|
||||
k <- dim(Fy)[2]
|
||||
r <- dim(Fy)[3]
|
||||
dim(Fy) <- c(n, k * r)
|
||||
} else {
|
||||
stop("'Fy' must be a vector, matrix or 3-tensor")
|
||||
}
|
||||
|
||||
|
||||
### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
|
||||
# Vectorize
|
||||
dim(Fy) <- c(n, k * r)
|
||||
dim(X) <- c(n, p * q)
|
||||
# Solve
|
||||
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
|
||||
if (n <= k * r || qr(cpFy)$rank < k * r) {
|
||||
# In case of under-determined system replace the inverse in the normal
|
||||
# equation by the Moore-Penrose Pseudo Inverse
|
||||
B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
|
||||
} else {
|
||||
# Compute OLS estimate by the Normal Equation
|
||||
B <- t(solve(cpFy, crossprod(Fy, X)))
|
||||
}
|
||||
|
||||
# De-vectorize (from now on tensor arithmetic)
|
||||
dim(Fy) <- c(n, k, r)
|
||||
dim(X) <- c(n, p, q)
|
||||
|
||||
# Decompose `B = alpha x beta` into `alpha` and `beta`
|
||||
c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
|
||||
|
||||
# Compute residuals
|
||||
R <- X - (Fy %x_3% alpha0 %x_2% beta0)
|
||||
|
||||
# Covariance estimates and scaling factor
|
||||
Delta.1 <- tcrossprod(mat(R, 3))
|
||||
Delta.2 <- tcrossprod(mat(R, 2))
|
||||
s <- sum(diag(Delta.1))
|
||||
|
||||
# Inverse Covariances
|
||||
Delta.1.inv <- solve(Delta.1)
|
||||
Delta.2.inv <- solve(Delta.2)
|
||||
|
||||
# cross dependent covariance estimates
|
||||
S.1 <- n^-1 * tcrossprod(mat(R, 3), mat(R %x_2% Delta.2.inv, 3))
|
||||
S.2 <- n^-1 * tcrossprod(mat(R, 2), mat(R %x_3% Delta.1.inv, 2))
|
||||
|
||||
# Evaluate negative log-likelihood (2 pi term dropped)
|
||||
loss <- -0.5 * n * (p * q * log(s) - p * log(det(Delta.1)) -
|
||||
q * log(det(Delta.2)) - s * sum(S.1 * Delta.1.inv))
|
||||
|
||||
# Gradient "generating" tensor
|
||||
G <- (sum(S.1 * Delta.1.inv) - p * q / s) * R
|
||||
G <- G + R %x_2% ((diag(q, p, p) - s * (Delta.2.inv %*% S.2)) %*% Delta.2.inv)
|
||||
G <- G + R %x_3% ((diag(p, q, q) - s * (Delta.1.inv %*% S.1)) %*% Delta.1.inv)
|
||||
G <- G + s * (R %x_2% Delta.2.inv %x_3% Delta.1.inv)
|
||||
|
||||
# Call history callback (logger) before the first iteration
|
||||
if (is.function(logger)) {
|
||||
logger(0L, loss, alpha0, beta0, Delta.1, Delta.2, NA)
|
||||
}
|
||||
|
||||
|
||||
### Step 2: MLE estimate with LS solution as starting value
|
||||
a0 <- 0
|
||||
a1 <- 1
|
||||
alpha1 <- alpha0
|
||||
beta1 <- beta0
|
||||
|
||||
# main descent loop
|
||||
no.nesterov <- TRUE
|
||||
break.reason <- NA
|
||||
for (iter in seq_len(max.iter)) {
|
||||
if (no.nesterov) {
|
||||
# without extrapolation as fallback
|
||||
alpha.moment <- alpha1
|
||||
beta.moment <- beta1
|
||||
} else {
|
||||
# extrapolation using previous direction
|
||||
alpha.moment <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0)
|
||||
beta.moment <- beta1 + ((a0 - 1) / a1) * ( beta1 - beta0)
|
||||
}
|
||||
}
|
||||
|
||||
# Extrapolated residuals
|
||||
R <- X - (Fy %x_3% alpha.moment %x_2% beta.moment)
|
||||
|
||||
|
||||
|
||||
list(loss = loss, alpha = alpha1, beta = beta1, Delta.1 = Delta.1, Delta.2 = Delta.2)
|
||||
}
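# Scratch check for the covariance estimates used above (assumes the package's
# `mat()` matricization; `R`, `n`, `p`, `q` below are illustrative):
# tcrossprod(mat(R, 3)) equals sum_i R_i' R_i and tcrossprod(mat(R, 2)) equals
# sum_i R_i R_i', so dividing by n gives exactly the Delta_1, Delta_2 from the
# roxygen comment.
# n <- 10; p <- 3; q <- 4
# R <- array(rnorm(n * p * q), dim = c(n, p, q))
# D1.loop <- Reduce(`+`, lapply(seq_len(n), function(i) crossprod(R[i, , ])))
# D2.loop <- Reduce(`+`, lapply(seq_len(n), function(i) tcrossprod(R[i, , ])))
# all.equal(D1.loop, tcrossprod(mat(R, 3)))
# all.equal(D2.loop, tcrossprod(mat(R, 2)))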
|
|
@ -0,0 +1,136 @@
|
|||
#' (Slightly altered) old implementation
|
||||
#'
|
||||
#' @export
|
||||
kpir.base <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
|
||||
method = c("mle", "ls"),
|
||||
eps1 = 1e-10, eps2 = 1e-10, max.iter = 500L,
|
||||
logger = NULL
|
||||
) {
|
||||
|
||||
# Check if X and Fy have same number of observations
|
||||
stopifnot(nrow(X) == NROW(Fy))
|
||||
n <- nrow(X) # Number of observations
|
||||
|
||||
# Get and check predictor dimensions
|
||||
if (length(dim(X)) == 2L) {
|
||||
stopifnot(!missing(shape))
|
||||
stopifnot(ncol(X) == prod(shape[1:2]))
|
||||
p <- as.integer(shape[1]) # Predictor "height"
|
||||
q <- as.integer(shape[2]) # Predictor "width"
|
||||
} else if (length(dim(X)) == 3L) {
|
||||
p <- dim(X)[2]
|
||||
q <- dim(X)[3]
|
||||
dim(X) <- c(n, p * q)
|
||||
} else {
|
||||
stop("'X' must be a matrix or 3-tensor")
|
||||
}
|
||||
|
||||
# Get and check response dimensions
|
||||
if (!is.array(Fy)) {
|
||||
Fy <- as.array(Fy)
|
||||
}
|
||||
if (length(dim(Fy)) == 1L) {
|
||||
k <- r <- 1L
|
||||
dim(Fy) <- c(n, 1L)
|
||||
} else if (length(dim(Fy)) == 2L) {
|
||||
stopifnot(!missing(shape))
|
||||
stopifnot(ncol(Fy) == prod(shape[3:4]))
|
||||
k <- as.integer(shape[3]) # Response functional "height"
|
||||
r <- as.integer(shape[4]) # Response functional "width"
|
||||
} else if (length(dim(Fy)) == 3L) {
|
||||
k <- dim(Fy)[2]
|
||||
r <- dim(Fy)[3]
|
||||
dim(Fy) <- c(n, k * r)
|
||||
} else {
|
||||
stop("'Fy' must be a vector, matrix or 3-tensor")
|
||||
}
|
||||
|
||||
log.likelihood <- function(par, X, Fy, Delta.inv, da, db) {
|
||||
alpha <- matrix(par[1:prod(da)], da[1L])
|
||||
beta <- matrix(par[(prod(da) + 1):length(par)], db[1L])
|
||||
error <- X - tcrossprod(Fy, kronecker(alpha, beta))
|
||||
sum(error * (error %*% Delta.inv))
|
||||
}
|
||||
|
||||
# Validate method using partial matching.
|
||||
method <- match.arg(method)
|
||||
|
||||
### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
|
||||
cpFy <- crossprod(Fy)
|
||||
if (n <= k * r || qr(cpFy)$rank < k * r) {
|
||||
# In case of under-determined system replace the inverse in the normal
|
||||
# equation by the Moore-Penrose Pseudo Inverse
|
||||
B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
|
||||
} else {
|
||||
# Compute OLS estimate by the Normal Equation
|
||||
B <- t(solve(cpFy, crossprod(Fy, X)))
|
||||
}
|
||||
|
||||
# Estimate alpha, beta as nearest kronecker approximation.
|
||||
c(alpha, beta) %<-% approx.kronecker(B, c(q, r), c(p, k))
|
||||
|
||||
if (method == "ls") {
|
||||
# Estimate Delta.
|
||||
B <- kronecker(alpha, beta)
|
||||
rank <- if (ncol(Fy) == 1) 1L else qr(Fy)$rank
|
||||
resid <- X - tcrossprod(Fy, B)
|
||||
Delta <- crossprod(resid) / (nrow(X) - rank)
|
||||
|
||||
} else { # mle
|
||||
B <- kronecker(alpha, beta)
|
||||
|
||||
# Compute residuals
|
||||
resid <- X - tcrossprod(Fy, B)
|
||||
|
||||
# Estimate initial Delta.
|
||||
Delta <- crossprod(resid) / nrow(X)
|
||||
|
||||
# call logger with initial starting value
|
||||
if (is.function(logger)) {
|
||||
# Transformed Residuals (using `matpow` as robust inversion algo,
|
||||
# uses Moore-Penrose Pseudo Inverse in case of singular `Delta`)
|
||||
resid.trans <- resid %*% matpow(Delta, -1)
|
||||
loss <- 0.5 * (nrow(X) * log(det(Delta)) + sum(resid.trans * resid))
|
||||
logger(0L, loss, alpha, beta, Delta, NA)
|
||||
}
|
||||
|
||||
for (iter in 1:max.iter) {
|
||||
# Optimize log-likelihood for alpha, beta with fixed Delta.
|
||||
opt <- optim(c(alpha, beta), log.likelihood, gr = NULL,
|
||||
X, Fy, matpow(Delta, -1), c(q, r), c(p, k))
|
||||
# Store previous alpha, beta and Delta (for break condition).
|
||||
Delta.last <- Delta
|
||||
B.last <- B
|
||||
# Extract optimized alpha, beta.
|
||||
alpha <- matrix(opt$par[1:(q * r)], q, r)
|
||||
beta <- matrix(opt$par[(q * r + 1):length(opt$par)], p, k)
|
||||
# Calc new Delta with likelihood optimized alpha, beta.
|
||||
B <- kronecker(alpha, beta)
|
||||
resid <- X - tcrossprod(Fy, B)
|
||||
Delta <- crossprod(resid) / nrow(X)
|
||||
|
||||
# call logger before break condition check
|
||||
if (is.function(logger)) {
|
||||
# Transformed Residuals (using `matpow` as robust inversion algo,
|
||||
# uses Moore-Penrose Pseudo Inverse in case of singular `Delta`)
|
||||
resid.trans <- resid %*% matpow(Delta, -1)
|
||||
loss <- 0.5 * (nrow(X) * log(det(Delta)) + sum(resid.trans * resid))
|
||||
logger(iter, loss, alpha, beta, Delta, NA)
|
||||
}
|
||||
|
||||
# Check break condition 1.
|
||||
if (norm(Delta - Delta.last, "F") < eps1 * norm(Delta, "F")) {
|
||||
# Check break condition 2.
|
||||
if (norm(B - B.last, "F") < eps2 * norm(B, "F")) {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# calc final loss
|
||||
resid.trans <- resid %*% matpow(Delta, -1)
|
||||
loss <- 0.5 * (nrow(X) * log(det(Delta)) + sum(resid.trans * resid))
|
||||
|
||||
list(loss = loss, alpha = alpha, beta = beta, Delta = Delta)
|
||||
}
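# Scratch illustration of the Step 1 fallback shared by the kpir.* functions:
# when crossprod(Fy) is rank deficient (below, column 3 duplicates column 1)
# the normal equations cannot be solved directly and a Moore-Penrose pseudo
# inverse is used instead. The package does this via matpow(cpFy, -1); the
# `pinv` helper below is only a hypothetical SVD-based stand-in for this sketch.
# n <- 50
# Fy <- matrix(rnorm(n * 3), n, 3)
# Fy[, 3] <- Fy[, 1]                  # only 2 linearly independent columns
# X <- matrix(rnorm(n * 4), n, 4)
# cpFy <- crossprod(Fy)
# qr(cpFy)$rank                       # 2 < ncol(Fy), solve(cpFy, ...) would fail
# pinv <- function(M, tol = 1e-10) {
#     s <- svd(M)
#     keep <- s$d > tol * max(s$d)
#     s$v[, keep, drop = FALSE] %*% diag(1 / s$d[keep], sum(keep)) %*%
#         t(s$u[, keep, drop = FALSE])
# }
# B <- t(pinv(cpFy) %*% crossprod(Fy, X))   # minimum norm least squares fit
# dim(B)                                    # ncol(X) by ncol(Fy), as for `B` above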
|
|
@ -0,0 +1,211 @@
|
|||
#' Gradient Descent Based Tensor Predictors method with Nesterov Accelerated
|
||||
#' Momentum and Kronecker structure assumption for the residual covariance
|
||||
#' `Delta = Delta.1 %x% Delta.2` (simple plugin version!)
|
||||
#'
|
||||
#' @export
|
||||
kpir.kron <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
|
||||
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
|
||||
nesterov.scaling = function(a, t) { 0.5 * (1 + sqrt(1 + (2 * a)^2)) },
|
||||
eps = .Machine$double.eps,
|
||||
logger = NULL
|
||||
) {
|
||||
|
||||
# Check if X and Fy have same number of observations
|
||||
stopifnot(nrow(X) == NROW(Fy))
|
||||
n <- nrow(X) # Number of observations
|
||||
|
||||
# Get and check predictor dimensions (convert to 3-tensor if needed)
|
||||
if (length(dim(X)) == 2L) {
|
||||
stopifnot(!missing(shape))
|
||||
stopifnot(ncol(X) == prod(shape[1:2]))
|
||||
p <- as.integer(shape[1]) # Predictor "height"
|
||||
q <- as.integer(shape[2]) # Predictor "width"
|
||||
} else if (length(dim(X)) == 3L) {
|
||||
p <- dim(X)[2]
|
||||
q <- dim(X)[3]
|
||||
} else {
|
||||
stop("'X' must be a matrix or 3-tensor")
|
||||
}
|
||||
|
||||
# Get and check response dimensions (and convert to 3-tensor if needed)
|
||||
if (!is.array(Fy)) {
|
||||
Fy <- as.array(Fy)
|
||||
}
|
||||
if (length(dim(Fy)) == 1L) {
|
||||
k <- r <- 1L
|
||||
dim(Fy) <- c(n, 1L, 1L)
|
||||
} else if (length(dim(Fy)) == 2L) {
|
||||
stopifnot(!missing(shape))
|
||||
stopifnot(ncol(Fy) == prod(shape[3:4]))
|
||||
k <- as.integer(shape[3]) # Response functional "height"
|
||||
r <- as.integer(shape[4]) # Response functional "width"
|
||||
} else if (length(dim(Fy)) == 3L) {
|
||||
k <- dim(Fy)[2]
|
||||
r <- dim(Fy)[3]
|
||||
} else {
|
||||
stop("'Fy' must be a vector, matrix or 3-tensor")
|
||||
}
|
||||
|
||||
|
||||
### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
|
||||
# Vectorize
|
||||
dim(Fy) <- c(n, k * r)
|
||||
dim(X) <- c(n, p * q)
|
||||
# Solve
|
||||
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
|
||||
if (n <= k * r || qr(cpFy)$rank < k * r) {
|
||||
# In case of under-determined system replace the inverse in the normal
|
||||
# equation by the Moore-Penrose Pseudo Inverse
|
||||
B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
|
||||
} else {
|
||||
# Compute OLS estimate by the Normal Equation
|
||||
B <- t(solve(cpFy, crossprod(Fy, X)))
|
||||
}
|
||||
|
||||
# De-vectorize (from now on tensor arithmetic)
|
||||
dim(Fy) <- c(n, k, r)
|
||||
dim(X) <- c(n, p, q)
|
||||
|
||||
# Decompose `B = alpha x beta` into `alpha` and `beta`
|
||||
c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
|
||||
|
||||
# Compute residuals
|
||||
resid <- X - (Fy %x_3% alpha0 %x_2% beta0)
|
||||
|
||||
# Covariance estimate
|
||||
Delta.1 <- tcrossprod(mat(resid, 3))
|
||||
Delta.2 <- tcrossprod(mat(resid, 2))
|
||||
tr <- sum(diag(Delta.1))
|
||||
Delta.1 <- Delta.1 / sqrt(n * tr)
|
||||
Delta.2 <- Delta.2 / sqrt(n * tr)
|
||||
|
||||
# Transformed Residuals
|
||||
resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)
|
||||
|
||||
# Evaluate negative log-likelihood
|
||||
loss <- 0.5 * (n * (p * log(det(Delta.1)) + q * log(det(Delta.2))) +
|
||||
sum(resid.trans * resid))
|
||||
|
||||
# Call history callback (logger) before the first iterate
|
||||
if (is.function(logger)) {
|
||||
logger(0L, loss, alpha0, beta0, Delta.1, Delta.2, NA)
|
||||
}
|
||||
|
||||
|
||||
### Step 2: MLE with LS solution as starting value
|
||||
a0 <- 0
|
||||
a1 <- 1
|
||||
alpha1 <- alpha0
|
||||
beta1 <- beta0
|
||||
|
||||
# main descent loop
|
||||
no.nesterov <- TRUE
|
||||
break.reason <- NA
|
||||
for (iter in seq_len(max.iter)) {
|
||||
if (no.nesterov) {
|
||||
# without extrapolation as fallback
|
||||
S.alpha <- alpha1
|
||||
S.beta <- beta1
|
||||
} else {
|
||||
# extrapolation using previous direction
|
||||
S.alpha <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0)
|
||||
S.beta <- beta1 + ((a0 - 1) / a1) * ( beta1 - beta0)
|
||||
}
|
||||
|
||||
# Extrapolated Residuals
|
||||
resid <- X - (Fy %x_3% S.alpha %x_2% S.beta)
|
||||
|
||||
# Covariance Estimates
|
||||
Delta.1 <- tcrossprod(mat(resid, 3))
|
||||
Delta.2 <- tcrossprod(mat(resid, 2))
|
||||
tr <- sum(diag(Delta.1))
|
||||
Delta.1 <- Delta.1 / sqrt(n * tr)
|
||||
Delta.2 <- Delta.2 / sqrt(n * tr)
|
||||
|
||||
# Transform Residuals
|
||||
resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)
|
||||
|
||||
# Calculate Gradients
|
||||
grad.alpha <- tcrossprod(mat(resid.trans, 3), mat(Fy %x_2% beta, 3))
|
||||
grad.beta <- tcrossprod(mat(resid.trans, 2), mat(Fy %x_3% alpha, 2))
|
||||
|
||||
# Backtracking line search (Armijo type)
|
||||
# The `inner.prod` is used in the Armijo break condition but does not
|
||||
# depend on the step size.
|
||||
inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
|
||||
|
||||
# backtracking loop
|
||||
for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) {
|
||||
# Update `alpha` and `beta` (note: add(+), the gradients are already
|
||||
# pointing into the negative slope direction of the loss because they are
|
||||
# the gradients of the log-likelihood [NOT the negative log-likelihood])
|
||||
alpha.temp <- S.alpha + delta * grad.alpha
|
||||
beta.temp <- S.beta + delta * grad.beta
|
||||
|
||||
# Update Residuals, Covariance and transformed Residuals
|
||||
resid <- X - (Fy %x_3% alpha.temp %x_2% beta.temp)
|
||||
Delta.1 <- tcrossprod(mat(resid, 3))
|
||||
Delta.2 <- tcrossprod(mat(resid, 2))
|
||||
tr <- sum(diag(Delta.1))
|
||||
Delta.1 <- Delta.1 / sqrt(n * tr)
|
||||
Delta.2 <- Delta.2 / sqrt(n * tr)
|
||||
resid.trans <- resid %x_3% solve(Delta.1) %x_2% solve(Delta.2)
|
||||
|
||||
# Evaluate negative log-likelihood
|
||||
loss.temp <- 0.5 * (n * (p * log(det(Delta.1)) + q * log(det(Delta.2)))
|
||||
+ sum(resid.trans * resid))
|
||||
|
||||
# Armijo line search break condition
|
||||
if (loss.temp <= loss - 0.1 * delta * inner.prod) {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
# Call logger (invoke history callback)
|
||||
if (is.function(logger)) {
|
||||
logger(iter, loss.temp, alpha.temp, beta.temp, Delta.1, Delta.2, delta)
|
||||
}
|
||||
|
||||
# Ensure descent
|
||||
if (loss.temp < loss) {
|
||||
alpha0 <- alpha1
|
||||
alpha1 <- alpha.temp
|
||||
beta0 <- beta1
|
||||
beta1 <- beta.temp
|
||||
|
||||
# check break conditions (in descent case)
|
||||
if (mean(abs(alpha1)) + mean(abs(beta1)) < eps) {
|
||||
break.reason <- "alpha, beta numerically zero"
|
||||
break # basically, estimates are zero -> stop
|
||||
}
|
||||
if (inner.prod < eps * (p * q + r * k)) {
|
||||
break.reason <- "mean squared gradient is smaller than epsilon"
|
||||
break # mean squared gradient is smaller than epsilon -> stop
|
||||
}
|
||||
if (abs(loss.temp - loss) < eps) {
|
||||
break.reason <- "decrease is too small (slow)"
|
||||
break # decrease is too small (slow) -> stop
|
||||
}
|
||||
|
||||
loss <- loss.temp
|
||||
no.nesterov <- FALSE # always reset
|
||||
} else if (!no.nesterov) {
|
||||
no.nesterov <- TRUE # retry without momentum
|
||||
next
|
||||
} else {
|
||||
break.reason <- "failed even without momentum"
|
||||
break # failed even without momentum -> stop
|
||||
}
|
||||
|
||||
# update momentum scaling
|
||||
a0 <- a1
|
||||
a1 <- nesterov.scaling(a1, iter)
|
||||
|
||||
# Set next iter starting step.size to line searched step size
|
||||
# (while allowing it to increase)
|
||||
step.size <- 1.618034 * delta
|
||||
|
||||
}
|
||||
|
||||
list(loss = loss, alpha = alpha1, beta = beta1, Delta.1 = Delta.1, Delta.2 = Delta.2, break.reason = break.reason)
|
||||
}
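# Scratch check of the sqrt(n * tr) scaling used above (assumes the package's
# `mat()` matricization; `resid`, `n`, `p`, `q` are illustrative): both raw
# factors have the same trace tr = sum(resid^2), and after dividing each by
# sqrt(n * tr) the Kronecker product kronecker(Delta.1, Delta.2) has trace
# sum(resid^2) / n, matching the trace of the unstructured estimate
# crossprod(matrix(resid, n)) / n.
# n <- 20; p <- 3; q <- 5
# resid <- array(rnorm(n * p * q), dim = c(n, p, q))
# Delta.1 <- tcrossprod(mat(resid, 3))
# Delta.2 <- tcrossprod(mat(resid, 2))
# tr <- sum(diag(Delta.1))
# all.equal(tr, sum(diag(Delta.2)))
# all.equal(tr, sum(resid^2))
# Delta.1 <- Delta.1 / sqrt(n * tr)
# Delta.2 <- Delta.2 / sqrt(n * tr)
# all.equal(sum(diag(Delta.1)) * sum(diag(Delta.2)), sum(resid^2) / n)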
|
|
@ -0,0 +1,192 @@
|
|||
#' Gradient Descent based Tensor Predictors method with Nesterov Accelerated
|
||||
#' Momentum
|
||||
#'
|
||||
#' @export
|
||||
kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
|
||||
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
|
||||
nesterov.scaling = function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2)),
|
||||
eps = .Machine$double.eps,
|
||||
logger = NULL
|
||||
) {
|
||||
|
||||
# Check if X and Fy have same number of observations
|
||||
stopifnot(nrow(X) == NROW(Fy))
|
||||
n <- nrow(X) # Number of observations
|
||||
|
||||
# Get and check predictor dimensions
|
||||
if (length(dim(X)) == 2L) {
|
||||
stopifnot(!missing(shape))
|
||||
stopifnot(ncol(X) == prod(shape[1:2]))
|
||||
p <- as.integer(shape[1]) # Predictor "height"
|
||||
q <- as.integer(shape[2]) # Predictor "width"
|
||||
} else if (length(dim(X)) == 3L) {
|
||||
p <- dim(X)[2]
|
||||
q <- dim(X)[3]
|
||||
dim(X) <- c(n, p * q)
|
||||
} else {
|
||||
stop("'X' must be a matrix or 3-tensor")
|
||||
}
|
||||
|
||||
# Get and check response dimensions
|
||||
if (!is.array(Fy)) {
|
||||
Fy <- as.array(Fy)
|
||||
}
|
||||
if (length(dim(Fy)) == 1L) {
|
||||
k <- r <- 1L
|
||||
dim(Fy) <- c(n, 1L)
|
||||
} else if (length(dim(Fy)) == 2L) {
|
||||
stopifnot(!missing(shape))
|
||||
stopifnot(ncol(Fy) == prod(shape[3:4]))
|
||||
k <- as.integer(shape[3]) # Response functional "height"
|
||||
r <- as.integer(shape[4]) # Response functional "width"
|
||||
} else if (length(dim(Fy)) == 3L) {
|
||||
k <- dim(Fy)[2]
|
||||
r <- dim(Fy)[3]
|
||||
dim(Fy) <- c(n, k * r)
|
||||
} else {
|
||||
stop("'Fy' must be a vector, matrix or 3-tensor")
|
||||
}
|
||||
|
||||
|
||||
### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
|
||||
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
|
||||
if (n <= k * r || qr(cpFy)$rank < k * r) {
|
||||
# In case of under-determined system replace the inverse in the normal
|
||||
# equation by the Moore-Penrose Pseudo Inverse
|
||||
B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
|
||||
} else {
|
||||
# Compute OLS estimate by the Normal Equation
|
||||
B <- t(solve(cpFy, crossprod(Fy, X)))
|
||||
}
|
||||
|
||||
# Decompose `B = alpha x beta` into `alpha` and `beta`
|
||||
c(alpha0, beta0) %<-% approx.kronecker(B, c(q, r), c(p, k))
|
||||
|
||||
# Compute residuals
|
||||
resid <- X - tcrossprod(Fy, kronecker(alpha0, beta0))
|
||||
|
||||
# Covariance estimate
|
||||
Delta <- crossprod(resid) / n
|
||||
|
||||
# Transformed Residuals (using `matpow` as robust inversion algo,
|
||||
# uses Moore-Penrose Pseudo Inverse in case of singular `Delta`)
|
||||
resid.trans <- resid %*% matpow(Delta, -1)
|
||||
|
||||
# Evaluate negative log-likelihood
|
||||
loss <- 0.5 * (n * log(det(Delta)) + sum(resid.trans * resid))
|
||||
|
||||
# Call history callback (logger) before the first iterate
|
||||
if (is.function(logger)) {
|
||||
logger(0L, loss, alpha0, beta0, Delta, NA)
|
||||
}
|
||||
|
||||
|
||||
### Step 2: MLE with LS solution as starting value
|
||||
a0 <- 0
|
||||
a1 <- 1
|
||||
alpha1 <- alpha0
|
||||
beta1 <- beta0
|
||||
|
||||
# main descent loop
|
||||
no.nesterov <- TRUE
|
||||
for (iter in seq_len(max.iter)) {
|
||||
if (no.nesterov) {
|
||||
# without extrapolation as fallback
|
||||
S.alpha <- alpha1
|
||||
S.beta <- beta1
|
||||
} else {
|
||||
# extrapolation using previous direction
|
||||
S.alpha <- alpha1 + ((a0 - 1) / a1) * (alpha1 - alpha0)
|
||||
S.beta <- beta1 + ((a0 - 1) / a1) * ( beta1 - beta0)
|
||||
}
|
||||
|
||||
# Extrapolated Residuals, Covariance and transformed Residuals
|
||||
resid <- X - tcrossprod(Fy, kronecker(S.alpha, S.beta))
|
||||
Delta <- crossprod(resid) / n
|
||||
resid.trans <- resid %*% matpow(Delta, -1)
|
||||
|
||||
# Sum over kronecker prod by observation (Face-Splitting Product)
|
||||
KR <- colSums(rowKronecker(Fy, resid.trans))
|
||||
dim(KR) <- c(p, q, k, r)
|
||||
|
||||
# (Nesterov) `alpha` Gradient
|
||||
R.alpha <- aperm(KR, c(2, 4, 1, 3))
|
||||
dim(R.alpha) <- c(q * r, p * k)
|
||||
grad.alpha <- c(R.alpha %*% c(S.beta))
|
||||
|
||||
# (Nesterov) `beta` Gradient
|
||||
R.beta <- aperm(KR, c(1, 3, 2, 4))
|
||||
dim(R.beta) <- c(p * k, q * r)
|
||||
grad.beta <- c(R.beta %*% c(S.alpha))
|
||||
|
||||
# Backtracking line search (Armijo type)
|
||||
# The `inner.prod` is used in the Armijo break condition but does not
|
||||
# depend on the step size.
|
||||
inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
|
||||
|
||||
# backtracking loop
|
||||
for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) {
|
||||
# Update `alpha` and `beta` (note: add(+), the gradients are already
|
||||
# pointing into the negative slope direction of the loss because they are
|
||||
# the gradients of the log-likelihood [NOT the negative log-likelihood])
|
||||
alpha.temp <- S.alpha + delta * grad.alpha
|
||||
beta.temp <- S.beta + delta * grad.beta
|
||||
|
||||
# Update Residuals, Covariance and transformed Residuals
|
||||
resid <- X - tcrossprod(Fy, kronecker(alpha.temp, beta.temp))
|
||||
Delta <- crossprod(resid) / n
|
||||
resid.trans <- resid %*% matpow(Delta, -1)
|
||||
|
||||
# Evaluate negative log-likelihood
|
||||
loss.temp <- 0.5 * (n * log(det(Delta)) + sum(resid.trans * resid))
|
||||
|
||||
# Armijo line search break condition
|
||||
if (loss.temp <= loss - 0.1 * delta * inner.prod) {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
# Call logger (invoke history callback)
|
||||
if (is.function(logger)) {
|
||||
logger(iter, loss.temp, alpha.temp, beta.temp, Delta, delta)
|
||||
}
|
||||
|
||||
# Ensure descent
|
||||
if (loss.temp < loss) {
|
||||
alpha0 <- alpha1
|
||||
alpha1 <- alpha.temp
|
||||
beta0 <- beta1
|
||||
beta1 <- beta.temp
|
||||
|
||||
# check break conditions (in descent case)
|
||||
if (mean(abs(alpha1)) + mean(abs(beta1)) < eps) {
|
||||
break # basically, estimates are zero -> stop
|
||||
}
|
||||
if (inner.prod < eps * (p * q + r * k)) {
|
||||
break # mean squared gradient is smaller than epsilon -> stop
|
||||
}
|
||||
if (abs(loss.temp - loss) < eps) {
|
||||
break # decrease is too small (slow) -> stop
|
||||
}
|
||||
|
||||
loss <- loss.temp
|
||||
no.nesterov <- FALSE # always reset
|
||||
} else if (!no.nesterov) {
|
||||
no.nesterov <- TRUE # retry without momentum
|
||||
next
|
||||
} else {
|
||||
break # failed even without momentum -> stop
|
||||
}
|
||||
|
||||
# update momentum scaling
|
||||
a0 <- a1
|
||||
a1 <- nesterov.scaling(a1, iter)
|
||||
|
||||
# Set next iter starting step.size to line searched step size
|
||||
# (while allowing it to increase)
|
||||
step.size <- 1.618034 * delta
|
||||
|
||||
}
|
||||
|
||||
list(loss = loss, alpha = alpha1, beta = beta1, Delta = Delta)
|
||||
}
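# Scratch illustration of the Nesterov extrapolation used above: a0 starts at 0
# and a1 at 1, and a1 is updated by `nesterov.scaling` after each accepted
# step, so the extrapolation weight (a0 - 1) / a1 used by later iterations
# grows from 0 toward 1, i.e. the momentum gets stronger as the descent runs.
# nesterov.scaling <- function(a, t) 0.5 * (1 + sqrt(1 + (2 * a)^2))
# a0 <- 0; a1 <- 1
# weights <- numeric(8)
# for (iter in seq_along(weights)) {
#     a0 <- a1
#     a1 <- nesterov.scaling(a1, iter)
#     weights[iter] <- (a0 - 1) / a1
# }
# round(weights, 3)   # 0.000, 0.282, 0.434, ... increasing toward 1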
|
|
@ -0,0 +1,157 @@
|
|||
#' Gradient Descent Based Tensor Predictors method
|
||||
#'
|
||||
#' @export
|
||||
kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy)[-1]),
|
||||
max.iter = 500L, max.line.iter = 50L, step.size = 1e-3,
|
||||
eps = .Machine$double.eps,
|
||||
logger = NULL
|
||||
) {
|
||||
|
||||
# Check if X and Fy have same number of observations
|
||||
stopifnot(nrow(X) == NROW(Fy))
|
||||
n <- nrow(X) # Number of observations
|
||||
|
||||
# Get and check predictor dimensions
|
||||
if (length(dim(X)) == 2L) {
|
||||
stopifnot(!missing(shape))
|
||||
stopifnot(ncol(X) == prod(shape[1:2]))
|
||||
p <- as.integer(shape[1]) # Predictor "height"
|
||||
q <- as.integer(shape[2]) # Predictor "width"
|
||||
} else if (length(dim(X)) == 3L) {
|
||||
p <- dim(X)[2]
|
||||
q <- dim(X)[3]
|
||||
dim(X) <- c(n, p * q)
|
||||
} else {
|
||||
stop("'X' must be a matrix or 3-tensor")
|
||||
}
|
||||
|
||||
# Get and check response dimensions
|
||||
if (!is.array(Fy)) {
|
||||
Fy <- as.array(Fy)
|
||||
}
|
||||
if (length(dim(Fy)) == 1L) {
|
||||
k <- r <- 1L
|
||||
dim(Fy) <- c(n, 1L)
|
||||
} else if (length(dim(Fy)) == 2L) {
|
||||
stopifnot(!missing(shape))
|
||||
stopifnot(ncol(Fy) == prod(shape[3:4]))
|
||||
k <- as.integer(shape[3]) # Response functional "height"
|
||||
r <- as.integer(shape[4]) # Response functional "width"
|
||||
} else if (length(dim(Fy)) == 3L) {
|
||||
k <- dim(Fy)[2]
|
||||
r <- dim(Fy)[3]
|
||||
dim(Fy) <- c(n, k * r)
|
||||
} else {
|
||||
stop("'Fy' must be a vector, matrix or 3-tensor")
|
||||
}
|
||||
|
||||
### Step 1: (Approx) Least Squares solution for `X = Fy B' + epsilon`
|
||||
cpFy <- crossprod(Fy) # TODO: Check/Test and/or replace
|
||||
if (n <= k * r || qr(cpFy)$rank < k * r) {
|
||||
# In case of under-determined system replace the inverse in the normal
|
||||
# equation by the Moore-Penrose Pseudo Inverse
|
||||
B <- t(matpow(cpFy, -1) %*% crossprod(Fy, X))
|
||||
} else {
|
||||
# Compute OLS estimate by the Normal Equation
|
||||
B <- t(solve(cpFy, crossprod(Fy, X)))
|
||||
}
|
||||
|
||||
# Decompose `B = alpha x beta` into `alpha` and `beta`
|
||||
c(alpha, beta) %<-% approx.kronecker(B, c(q, r), c(p, k))
|
||||
|
||||
# Compute residuals
|
||||
resid <- X - tcrossprod(Fy, kronecker(alpha, beta))
|
||||
|
||||
# Covariance estimate
|
||||
Delta <- crossprod(resid) / n
|
||||
|
||||
# Transformed Residuals (using `matpow` as robust inversion algo,
|
||||
# uses Moore-Penrose Pseudo Inverse in case of singular `Delta`)
|
||||
resid.trans <- resid %*% matpow(Delta, -1)
|
||||
|
||||
# Evaluate negative log-likelihood
|
||||
loss <- 0.5 * (n * log(det(Delta)) + sum(resid.trans * resid))
|
||||
|
||||
# Call history callback (logger) before the first iterate
|
||||
if (is.function(logger)) {
|
||||
logger(0L, loss, alpha, beta, Delta, NA)
|
||||
}
|
||||
|
||||
|
||||
### Step 2: MLE with LS solution as starting value
|
||||
for (iter in seq_len(max.iter)) {
|
||||
# Sum over kronecker prod by observation (Face-Splitting Product)
|
||||
KR <- colSums(rowKronecker(Fy, resid.trans))
|
||||
dim(KR) <- c(p, q, k, r)
|
||||
|
||||
# `alpha` Gradient
|
||||
R.Alpha <- aperm(KR, c(2, 4, 1, 3))
|
||||
dim(R.Alpha) <- c(q * r, p * k)
|
||||
grad.alpha <- c(R.Alpha %*% c(beta))
|
||||
|
||||
# `beta` Gradient
|
||||
R.Beta <- aperm(KR, c(1, 3, 2, 4))
|
||||
dim(R.Beta) <- c(p * k, q * r)
|
||||
grad.beta <- c(R.Beta %*% c(alpha))
|
||||
|
||||
# Line Search (Armijo type)
|
||||
# The `inner.prod` is used in the Armijo break condition but does not
|
||||
# depend on the step size.
|
||||
inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2)
|
||||
|
||||
# Line Search loop
|
||||
for (delta in step.size * 0.618034^seq.int(0L, len = max.line.iter)) {
|
||||
# Update `alpha` and `beta` (note: add(+), the gradients are already
|
||||
# pointing into the negative slope direction of the loss cause they are
|
||||
# the gradients of the log-likelihood [NOT the negative log-likelihood])
|
||||
alpha.temp <- alpha + delta * grad.alpha
|
||||
beta.temp <- beta + delta * grad.beta
|
||||
|
||||
# Update Residuals, Covariance and transformed Residuals
|
||||
resid <- X - tcrossprod(Fy, kronecker(alpha.temp, beta.temp))
|
||||
Delta <- crossprod(resid) / n
|
||||
resid.trans <- resid %*% matpow(Delta, -1)
|
||||
|
||||
# Evaluate negative log-likelihood
|
||||
loss.temp <- 0.5 * (n * log(det(Delta)) + sum(resid.trans * resid))
|
||||
|
||||
# Armijo line search break condition
|
||||
if (loss.temp <= loss - 0.1 * delta * inner.prod) {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
# Call logger (invoce history callback)
|
||||
if (is.function(logger)) {
|
||||
logger(iter, loss.temp, alpha.temp, beta.temp, Delta, delta)
|
||||
}
|
||||
|
||||
# Ensure descent
|
||||
if (loss.temp < loss) {
|
||||
alpha <- alpha.temp
|
||||
beta <- beta.temp
|
||||
|
||||
# check break conditions (in descent case)
|
||||
if (mean(abs(alpha)) + mean(abs(beta)) < eps) {
|
||||
break # basically, estimates are zero -> stop
|
||||
}
|
||||
if (inner.prod < eps * (p * q + r * k)) {
|
||||
break # mean squared gradient is smaller than epsilon -> stop
|
||||
}
|
||||
if (abs(loss.temp - loss) < eps) {
|
||||
break # decrease is too small (slow) -> stop
|
||||
}
|
||||
|
||||
loss <- loss.temp
|
||||
} else {
|
||||
break
|
||||
}
|
||||
|
||||
# Set next iter starting step.size to line searched step size
|
||||
# (while allowing it to encrease)
|
||||
step.size <- 1.618034 * delta
|
||||
|
||||
}
|
||||
|
||||
list(loss = loss, alpha = alpha, beta = beta, Delta = Delta)
|
||||
}
|
|
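A hedged usage sketch for `kpir.new` on simulated matrix-valued predictors (assumes the package providing `kpir.new` and its helpers `matpow`, `approx.kronecker`, `rowKronecker` and `%<-%` is loaded; the simulation setup is purely illustrative):

n <- 200L; p <- 5L; q <- 4L; k <- 2L; r <- 3L
alpha.true <- matrix(rnorm(q * r), q, r)
beta.true  <- matrix(rnorm(p * k), p, k)
Fy <- matrix(rnorm(n * k * r), n, k * r)
X  <- tcrossprod(Fy, kronecker(alpha.true, beta.true)) + matrix(rnorm(n * p * q), n)
fit <- kpir.new(X, Fy, shape = c(p, q, k, r),
    logger = function(iter, loss, ...) cat(sprintf("iter %3d: loss %.4f\n", iter, loss)))
str(fit[c("alpha", "beta")])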
@ -0,0 +1,23 @@
#' Matricization
#'
#' @param T multi-dimensional array of order at least \code{mode}
#' @param mode dimension along which to matricize
#'
#' @returns matrix of dimensions \code{dim(T)[mode]} by \code{prod(dim(T)[-mode])}
#'
#' @export
mat <- function(T, mode) {
    mode <- as.integer(mode)

    dims <- dim(T)
    if (length(dims) < mode) {
        stop("Mode must be a positive integer not exceeding the tensor order")
    }

    if (mode > 1L) {
        T <- aperm(T, c(mode, seq_along(dims)[-mode]))
    }
    dim(T) <- c(dims[mode], prod(dims[-mode]))

    T
}
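A quick sanity check of the matricization on a small array (base R only): a 2 x 3 x 4 array matricized along mode 2 becomes a 3 x 8 matrix whose columns enumerate the remaining modes, first mode fastest.

A <- array(seq_len(2 * 3 * 4), dim = c(2, 3, 4))
M2 <- mat(A, 2)
dim(M2)                            # c(3, 8)
all(M2[2, 1:2] == A[1:2, 2, 1])    # TRUE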
@ -15,7 +15,8 @@
 #'
 #' @export
 matrixImage <- function(A, add.values = FALSE,
-    main = NULL, sub = NULL, interpolate = FALSE, ...
+    main = NULL, sub = NULL, interpolate = FALSE, ...,
+    digits = getOption("digits")
 ) {
     # plot raster image
     plot(c(0, ncol(A)), c(0, nrow(A)), type = "n", bty = "n", col = "red",

@ -38,8 +39,11 @@ matrixImage <- function(A, add.values = FALSE,

     # Writes matrix values (in colored element grids)
     if (any(add.values)) {
-        if (length(add.values) > 1) { A[!add.values] <- NA }
-        text(rep(x - 0.5, nrow(A)), rep(rev(y - 0.5), each = ncol(A)), A,
+        if (length(add.values) > 1) {
+            A[!add.values] <- NA
+            A[add.values] <- format(A[add.values], digits = digits)
+        }
+        text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A,
             adj = 0.5, col = as.integer(S > 0.65))
     }
 }
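The added `digits` argument controls how the overlaid cell values are formatted when `add.values` is a logical matrix selecting the labelled cells. A minimal sketch of such a call (only the arguments visible in this hunk are certain, the rest of `matrixImage` is assumed from context):

A <- matrix(rnorm(12), 3, 4)
matrixImage(A, add.values = A > 0, main = "Example", digits = 3L)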
@ -1,4 +1,4 @@
-#' Multi-Value assigne operator.
+#' Multi-Value assign operator.
 #'
 #' @param lhs vector of variables (or variable names) to assign values.
 #' @param rhs object that can be coerced to a list of the same length as
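This operator enables the destructuring assignment used in `kpir.new` (`c(alpha, beta) %<-% approx.kronecker(...)`); a small usage sketch, assuming `%<-%` behaves as documented above:

c(first, second) %<-% list(matrix(1, 2, 2), diag(3))
dim(first)    # c(2, 2)
dim(second)   # c(3, 3)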
@ -0,0 +1,28 @@
#' Tensor Times Matrix (n-mode tensor matrix product)
#'
#' @param T array of order at least \code{mode}
#' @param M matrix, the right hand side of the mode product such that
#'  \code{ncol(M)} equals \code{dim(T)[mode]}
#' @param mode the mode of the product in the range \code{1:length(dim(T))}
#'
#' @returns multi-dimensional array of the same order as \code{T} with
#'  \code{mode} dimension equal to \code{nrow(M)}
#'
#' @export
ttm <- function(T, M, mode = length(dim(T))) {
    storage.mode(T) <- storage.mode(M) <- "double"
    .Call("C_ttm", T, M, as.integer(mode))
}

#' @rdname ttm
#' @export
`%x_1%` <- function(T, M) ttm(T, M, 1L)
#' @rdname ttm
#' @export
`%x_2%` <- function(T, M) ttm(T, M, 2L)
#' @rdname ttm
#' @export
`%x_3%` <- function(T, M) ttm(T, M, 3L)
#' @rdname ttm
#' @export
`%x_4%` <- function(T, M) ttm(T, M, 4L)
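A consistency check relating `ttm` to `mat` (assumes the compiled `C_ttm` routine of this package is loaded): the mode-k matricization of the mode product equals `M %*% mat(A, k)`.

A <- array(rnorm(2 * 3 * 4), dim = c(2, 3, 4))
M <- matrix(rnorm(5 * 3), 5, 3)    # ncol(M) == dim(A)[2]
B <- A %x_2% M                     # same as ttm(A, M, 2L)
dim(B)                             # c(2, 5, 4)
all.equal(mat(B, 2), M %*% mat(A, 2))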
@ -0,0 +1,78 @@
#' Kronecker decomposed Variance Matrix estimation.
#'
#' @description Computes the Kronecker decomposition factors of the variance
#'  matrix
#'  \deqn{\var{X} = tr(L)tr(R) (L\otimes R).}
#'
#' @param X numeric matrix or 3d array.
#' @param shape in case of \code{X} being a matrix, this specifies the predictor
#'  shape, otherwise ignored.
#' @param center boolean specifying if \code{X} is centered before computing the
#'  left/right second moments. This is useful in the case of already centered
#'  data.
#'
#' @returns List containing
#' \describe{
#'   \item{lhs}{Left Hand Side \eqn{L} of the Kronecker decomposed variance matrix}
#'   \item{rhs}{Right Hand Side \eqn{R} of the Kronecker decomposed variance matrix}
#'   \item{trace}{Scaling factor \eqn{tr(L)tr(R)} for the variance matrix}
#' }
#'
#' @examples
#' n <- 503L   # nr. of observations
#' p <- 32L    # first predictor dimension
#' q <- 27L    # second predictor dimension
#' lhs <- 0.5^abs(outer(seq_len(q), seq_len(q), `-`))  # Left Var components
#' rhs <- 0.5^abs(outer(seq_len(p), seq_len(p), `-`))  # Right Var components
#' X <- rmvnorm(n, sigma = kronecker(lhs, rhs))        # Multivariate normal data
#'
#' # Estimate Kronecker decomposed variance matrix
#' dim(X)      # c(n, p * q)
#' fit <- var.kronecker(X, shape = c(p, q))
#'
#' # equivalent
#' dim(X) <- c(n, p, q)
#' fit <- var.kronecker(X)
#'
#' # Compute complete estimated variance matrix
#' varX.hat <- fit$trace^-1 * kronecker(fit$lhs, fit$rhs)
#'
#' # or its inverse
#' varXinv.hat <- fit$trace * kronecker(solve(fit$lhs), solve(fit$rhs))
#'
var.kronecker <- function(X, shape = dim(X)[-1], center = TRUE) {
    # Get and check predictor dimensions
    n <- nrow(X)
    if (length(dim(X)) == 2L) {
        stopifnot(ncol(X) == prod(shape[1:2]))
        p <- as.integer(shape[1])       # Predictor "height"
        q <- as.integer(shape[2])       # Predictor "width"
        dim(X) <- c(n, p, q)
    } else if (length(dim(X)) == 3L) {
        p <- dim(X)[2]
        q <- dim(X)[3]
    } else {
        stop("'X' must be a matrix or 3-tensor")
    }

    if (isTRUE(center)) {
        # Center X; X[, i, j] <- X[, i, j] - mean(X[, i, j])
        # (reshape to a matrix first so `scale` centers each column)
        dim(X) <- c(n, p * q)
        X <- scale(X, scale = FALSE)

        print(range(attr(X, "scaled:center")))

        dim(X) <- c(n, p, q)
    }

    # Calc left/right side of the Kronecker structured covariance
    # var(X) = var.lhs %x% var.rhs
    var.lhs <- .rowMeans(apply(X, 1, crossprod), q * q, n)
    dim(var.lhs) <- c(q, q)
    var.rhs <- .rowMeans(apply(X, 1, tcrossprod), p * p, n)
    dim(var.rhs) <- c(p, p)

    # Estimate scaling factor tr(var(X)) = tr(var.lhs) tr(var.rhs)
    trace <- sum(X^2) / n

    list(lhs = var.lhs, rhs = var.rhs, trace = trace)
}
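Why the reconstruction `fit$trace^-1 * kronecker(fit$lhs, fit$rhs)` recovers the variance: `lhs` estimates `tr(R) * L` and `rhs` estimates `tr(L) * R`, so their Kronecker product carries an extra factor `tr(L) tr(R)`, which is exactly what `trace` estimates. A small numerical sanity check in base R (avoids `rmvnorm` by sampling via a Cholesky factor):

n <- 10000L; p <- 3L; q <- 4L
L <- 0.5^abs(outer(seq_len(q), seq_len(q), `-`))
R <- 0.5^abs(outer(seq_len(p), seq_len(p), `-`))
X <- matrix(rnorm(n * p * q), n) %*% chol(kronecker(L, R))
fit <- var.kronecker(X, shape = c(p, q))
max(abs(fit$trace^-1 * kronecker(fit$lhs, fit$rhs) - kronecker(L, R)))  # small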
@ -0,0 +1,2 @@

PKG_LIBS = $(BLAS_LIBS) $(FLIBS)
@ -0,0 +1,23 @@
#include <R.h>
#include <Rinternals.h>
#include <R_ext/Rdynload.h>

// extern SEXP FastPOI_C_sub(SEXP in_B, SEXP in_Delta,
//     SEXP in_lambda, SEXP in_maxit, SEXP in_tol
// );

/* Tensor Times Matrix a.k.a. Mode Product */
extern SEXP ttm(SEXP A, SEXP X, SEXP mode);

/* List of registered routines (e.g. C entry points) */
static const R_CallMethodDef CallEntries[] = {
    // {"FastPOI_C_sub", (DL_FUNC) &FastPOI_C_sub, 5}, // NOT USED
    {"C_ttm", (DL_FUNC) &ttm, 3},
    {NULL, NULL, 0}
};

/* Restrict C entry points to registered routines. */
void R_init_tensorPredictors(DllInfo *dll) {
    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
    R_useDynamicSymbols(dll, FALSE);
}
@ -0,0 +1,115 @@
// The need for `USE_FC_LEN_T` and `FCONE` is due to a Fortran character string
// to C incompatibility. See: Writing R Extensions: 6.6.1 Fortran character strings
#define USE_FC_LEN_T
#include <R.h>
#include <Rinternals.h>
#include <R_ext/BLAS.h>
#ifndef FCONE
#define FCONE
#endif

/**
 * Tensor Times Matrix a.k.a. Mode Product
 *
 * @param A multi-dimensional array
 * @param B matrix
 * @param m mode index (1-indexed)
 */
extern SEXP ttm(SEXP A, SEXP B, SEXP m) {

    // get zero indexed mode
    int mode = asInteger(m) - 1;

    // get dimension attribute of A
    SEXP dim = getAttrib(A, R_DimSymbol);

    // validate mode (mode must be smaller than the nr of dimensions)
    if (mode < 0 || length(dim) <= mode) {
        error("Illegal mode");
    }

    // and check if B is a matrix of non-degenerate size
    if (!isMatrix(B)) {
        error("Expected a matrix as second argument");
    }
    if (!nrows(B) || !ncols(B)) {
        error("Zero dimension detected");
    }

    // check matching of dimensions
    if (INTEGER(dim)[mode] != ncols(B)) {
        error("Dimension mismatch (mode dim not equal to ncol)");
    }

    // calc nr of response elements `prod(dim(A)[-mode]) * nrow(B)`,
    int leny = 1;
    // and the strides
    //  `stride[0] <- prod(dim(A)[seq_len(mode - 1)])`
    //  `stride[1] <- dim(A)[mode]`
    //  `stride[2] <- prod(dim(A)[-seq_len(mode)])`
    int stride[3] = {1, INTEGER(dim)[mode], 1};
    for (int i = 0; i < length(dim); ++i) {
        int size = INTEGER(dim)[i];
        // check for non-degenerate dimensions
        if (!size) {
            error("Zero dimension detected");
        }
        leny *= (i == mode) ? nrows(B) : size;
        stride[0] *= (i < mode) ? size : 1;
        stride[2] *= (i > mode) ? size : 1;
    }

    // as well as the matrix rows (cols only needed once)
    int nrow = nrows(B);

    // create response object C
    SEXP C = PROTECT(allocVector(REALSXP, leny));

    // raw data access pointers
    double* a = REAL(A);
    double* b = REAL(B);
    double* c = REAL(C);

    // Tensor Times Matrix / Mode Product
    const double zero = 0.0;
    const double one = 1.0;
    if (mode == 0) {
        // mode 1: (A x_1 B)_(1) = B A_(1) as a single Matrix-Matrix product
        F77_CALL(dgemm)("N", "N", &nrow, &stride[2], &stride[1], &one,
            b, &nrow, a, &stride[1],
            &zero, c, &nrow FCONE FCONE);
    } else {
        // Other modes can be written as blocks of matrix multiplications
        for (int i2 = 0; i2 < stride[2]; ++i2) {
            F77_CALL(dgemm)("N", "T", &stride[0], &nrow, &stride[1], &one,
                &a[i2 * stride[0] * stride[1]], &stride[0], b, &nrow,
                &zero, &c[i2 * stride[0] * nrow], &stride[0] FCONE FCONE);
        }
    }
    /*
    // Tensor Times Matrix / Mode Product (reference implementation)
    memset(c, 0, leny * sizeof(double));
    for (int i2 = 0; i2 < stride[2]; ++i2) {
        for (int i1 = 0; i1 < stride[1]; ++i1) {    // stride[1] == ncols(B)
            for (int j = 0; j < nrow; ++j) {
                for (int i0 = 0; i0 < stride[0]; ++i0) {
                    c[i0 + (j + i2 * nrow) * stride[0]] +=
                        a[i0 + (i1 + i2 * stride[1]) * stride[0]] * b[j + i1 * nrow];
                }
            }
        }
    }
    */

    // finally, set result dimensions
    SEXP newdim = PROTECT(allocVector(INTSXP, length(dim)));
    for (int i = 0; i < length(dim); ++i) {
        INTEGER(newdim)[i] = (i == mode) ? nrows(B) : INTEGER(dim)[i];
    }
    setAttrib(C, R_DimSymbol, newdim);

    // release C to the hands of the garbage collector
    UNPROTECT(2);

    return C;
}
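A pure-R cross-check of the BLAS-based mode product, mirroring the commented reference loop above (assumes the package is compiled and loaded so that `ttm` is available):

ttm_ref <- function(A, M, mode) {
    dims <- dim(A)
    perm <- c(mode, seq_along(dims)[-mode])
    Amat <- matrix(aperm(A, perm), dims[mode])           # mode-matricization
    out <- array(M %*% Amat, c(nrow(M), dims[-mode]))    # fold back
    aperm(out, order(perm))                              # restore mode order
}
A <- array(rnorm(3 * 4 * 5), dim = c(3, 4, 5))
M <- matrix(rnorm(6 * 4), 6, 4)
all.equal(ttm(A, M, 2L), ttm_ref(A, M, 2L))              # TRUE (up to fp error)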