diff --git a/LaTeX/main.bib b/LaTeX/main.bib new file mode 100644 index 0000000..b7ac7ab --- /dev/null +++ b/LaTeX/main.bib @@ -0,0 +1,63 @@ +@article{RegMatrixReg-ZhouLi2014, + author = {Zhou, Hua and Li, Lexin}, + title = {Regularized matrix regression}, + journal = {Journal of the Royal Statistical Society. Series B (Statistical Methodology)}, + volume = {76}, + number = {2}, + pages = {463--483}, + year = {2014}, + publisher = {Royal Statistical Society, Wiley} +} + +@book{StatInf-CasellaBerger2002, + title = {{Statistical Inference}}, + author = {Casella, George and Berger, Roger L.}, + isbn = {0-534-24312-6}, + series = {Duxbury Advanced Series}, + year = {2002}, + edition = {2}, + publisher = {Thomson Learning} +} + +@book{MatrixDiffCalc-MagnusNeudecker1999, + title = {Matrix Differential Calculus with Applications in Statistics and Econometrics (Revised Edition)}, + author = {Magnus, Jan R. and Neudecker, Heinz}, + year = {1999}, + publisher = {John Wiley \& Sons Ltd}, + isbn = {0-471-98632-1} +} + +@book{MatrixAlgebra-AbadirMagnus2005, + title = {Matrix Algebra}, + author = {Abadir, Karim M. and Magnus, Jan R.}, + year = {2005}, + publisher = {Cambridge University Press}, + series = {Econometric Exercises}, + collection = {Econometric Exercises}, + place = {Cambridge}, + doi = {10.1017/CBO9780511810800} +} + +@article{TensorDecomp-HuLeeWang2022, + author = {Hu, Jiaxin and Lee, Chanwoo and Wang, Miaoyan}, + title = {Generalized Tensor Decomposition With Features on Multiple Modes}, + journal = {Journal of Computational and Graphical Statistics}, + volume = {31}, + number = {1}, + pages = {204--218}, + year = {2022}, + publisher = {Taylor \& Francis}, + doi = {10.1080/10618600.2021.1978471} +} + +@article{CovarEstSparseKron-LengPan2018, + author = {Leng, Chenlei and Pan, Guangming}, + title = {{Covariance estimation via sparse Kronecker structures}}, + volume = {24}, + journal = {Bernoulli}, + number = {4B}, + publisher = {Bernoulli Society for Mathematical Statistics and Probability}, + pages = {3833--3863}, + year = {2018}, + doi = {10.3150/17-BEJ980} +} diff --git a/LaTeX/main.tex b/LaTeX/main.tex new file mode 100644 index 0000000..fcb5d8f --- /dev/null +++ b/LaTeX/main.tex @@ -0,0 +1,630 @@ +\documentclass[a4paper, 10pt]{article} + +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{fullpage} +\usepackage{amsmath, amssymb, amstext, amsthm} +\usepackage{bm} % \boldsymbol and italic corrections, ... +\usepackage[pdftex]{hyperref} +\usepackage{makeidx} % Index (Symbols, Names, ...) +\usepackage{xcolor, graphicx} % colors and including images +\usepackage{tikz} +\usepackage[ + % backend=bibtex, + style=authoryear-comp ]{biblatex} + +% Document meta info +\title{Derivation of Gradient Descent Algorithm for K-PIR} +\author{Daniel Kapla} +\date{November 24, 2021} +% Set PDF title, author and creator.
+\AtBeginDocument{ + \hypersetup{ + pdftitle = {Derivation of Gradient Descent Algorithm for K-PIR}, + pdfauthor = {Daniel Kapla}, + pdfcreator = {\pdftexbanner} + } +} + +\makeindex + +% Bibliography resource(s) +\addbibresource{main.bib} + +% Setup environments +% Theorem, Lemma +\theoremstyle{plain} +\newtheorem{theorem}{Theorem} +\newtheorem{lemma}{Lemma} +\newtheorem{example}{Example} +% Definition +\theoremstyle{definition} +\newtheorem{defn}{Definition} +% Remark +\theoremstyle{remark} +\newtheorem{remark}{Remark} + +% Define math macros +\newcommand{\mat}[1]{\boldsymbol{#1}} +\newcommand{\ten}[1]{\mathcal{#1}} +\renewcommand{\vec}{\operatorname{vec}} +\DeclareMathOperator{\kron}{\otimes} % Kronecker Product +\DeclareMathOperator{\hada}{\odot} % Hadamard Product +\newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix) +\DeclareMathOperator{\df}{\operatorname{df}} +\DeclareMathOperator{\tr}{\operatorname{tr}} +\DeclareMathOperator{\var}{Var} +\DeclareMathOperator{\cov}{Cov} +\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}} +% \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}} +\DeclareMathOperator*{\argmin}{{arg\,min}} +\DeclareMathOperator*{\argmax}{{arg\,max}} +\newcommand{\D}{\textnormal{D}} +\renewcommand{\d}{\textnormal{d}} +\renewcommand{\t}[1]{{{#1}'}} +\newcommand{\pinv}[1]{{{#1}^{\dagger}}} % `Moore-Penrose pseudoinverse` +\newcommand{\todo}[1]{{\color{red}TODO: #1}} + +\begin{document} + +\maketitle + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% Introduction %%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Introduction} +We assume the model +\begin{displaymath} + \mat{X} = \mat{\mu} + \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon} +\end{displaymath} +where the dimensions of all the components are listed in Table~\ref{tab:dimensions}, +and its vectorized form is +\begin{displaymath} + \vec\mat{X} = \vec\mat{\mu} + (\mat{\alpha}\kron\mat{\beta})\vec\mat{f}_y + \vec\mat{\epsilon}. +\end{displaymath} + +\begin{table}[!htp] + \centering + \begin{minipage}{0.8\textwidth} + \centering + \begin{tabular}{l l} + $\mat X, \mat\mu, \mat R, \mat\epsilon$ & $p\times q$ \\ + $\mat{f}_y$ & $k\times r$ \\ + $\mat\alpha$ & $q\times r$ \\ + $\mat\beta$ & $p\times k$ \\ + $\mat\Delta$ & $p q\times p q$ \\ + $\mat\Delta_1$ & $q\times q$ \\ + $\mat\Delta_2$ & $p\times p$ \\ + $\mat{r}$ & $p q\times 1$ \\ + \hline + $\ten{X}, \ten{R}$ & $n\times p\times q$ \\ + $\ten{F}$ & $n\times k\times r$ \\ + \end{tabular} + \caption{\label{tab:dimensions}\small Summary listing of dimensions with the corresponding sample versions $\mat{X}_i, \mat{R}_i, \mat{r}_i, \mat{f}_{y_i}$ for $i = 1, ..., n$ as well as estimates $\widehat{\mat{\alpha}}, \widehat{\mat{\beta}}, \widehat{\mat\Delta}, \widehat{\mat\Delta}_1$ and $\widehat{\mat\Delta}_2$.} + \end{minipage} +\end{table} + +The log-likelihood $l$ is based on $n$ i.i.d.
observations, assuming that $\mat{X}_i\mid(Y = y_i)$ is normally distributed as +\begin{displaymath} + \vec\mat{X}_i \sim \mathcal{N}_{p q}(\vec\mat\mu + (\mat\alpha\kron\mat\beta)\vec\mat{f}_{y_i}, \mat\Delta). +\end{displaymath} +Replacing all unknown parameters by their estimates gives the (estimated) log-likelihood +\begin{equation}\label{eq:log-likelihood-est} + \hat{l}(\mat\alpha, \mat\beta) = -\frac{n q p}{2}\log 2\pi - \frac{n}{2}\log|\widehat{\mat\Delta}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat\Delta}^{-1}\mat{r}_i +\end{equation} +where the residuals are +\begin{displaymath} + \mat{r}_i = \vec\mat{X}_i - \vec\overline{\mat{X}} - (\mat\alpha\kron\mat\beta)\vec{\mat f}_{y_i}\qquad (p q \times 1) +\end{displaymath} +and, assuming $\mat\alpha, \mat\beta$ are known, the MLE of the covariance matrix $\widehat{\mat\Delta}$ solving the score equations is +\begin{equation}\label{eq:Delta} + \widehat{\mat\Delta} = \frac{1}{n}\sum_{i = 1}^n \mat{r}_i \t{\mat{r}_i} \qquad(p q \times p q). +\end{equation} +Note that the log-likelihood estimate $\hat{l}$ only depends on $\mat\alpha, \mat\beta$. Next, we compute the gradients of $\hat{l}$ with respect to $\mat\alpha$ and $\mat\beta$, which are used to formulate a gradient descent based estimation algorithm for $\mat\alpha, \mat\beta$ to replace the previous algorithm. The main reason is to enable estimation for larger dimensions of the coefficients $\mat\alpha, \mat\beta$, since the previous algorithm does \emph{not} solve the high run-time problem for larger dimensions. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% Derivative %%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Derivative of the Log-Likelihood} +We start with the general case where $\mat X_i|(Y_i = y_i)$ is multivariate normally distributed with the covariance $\mat\Delta$ being a $p q\times p q$ positive definite symmetric matrix \emph{without} any further assumptions. We have $i = 1, ..., n$ observations following +\begin{displaymath} + \mat{r}_i = \vec(\mat X_i - \mat\mu - \mat\beta\mat{f}_{y_i}\t{\mat\alpha}) \sim \mathcal{N}_{p q}(\mat 0, \mat\Delta). +\end{displaymath} +The MLE estimates of $\mat\mu, \mat\Delta$ are +\begin{displaymath} + \widehat{\mat\mu} = \overline{\mat X} = \frac{1}{n}\sum_{i = 1}^n \mat X_i {\color{gray}\qquad(p\times q)}, + \qquad \widehat{\mat\Delta} = \frac{1}{n}\sum_{i = 1}^n \mat r_i\t{\mat r_i} {\color{gray}\qquad(p q\times p q)}. +\end{displaymath} +Substitution of the MLE estimates into the log-likelihood $l(\mat\mu, \mat\Delta, \mat\alpha, \mat\beta)$ gives the estimated log-likelihood $\hat{l}(\mat\alpha, \mat\beta)$ as +\begin{displaymath} + \hat{l}(\mat\alpha, \mat\beta) = -\frac{n q p}{2}\log 2\pi - \frac{n}{2}\log|\widehat{\mat\Delta}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat\Delta}^{-1}\mat{r}_i. +\end{displaymath} +We are interested in the gradients $\nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta)$, $\nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta)$ of the estimated log-likelihood. Therefore, we consider the differential of $\hat{l}$.
+\begin{align} + \d\hat{l}(\mat\alpha, \mat\beta) + &= -\frac{n}{2}\d\log|\widehat{\mat{\Delta}}| - \frac{1}{2}\sum_{i = 1}^n \big(\t{(\d \mat{r}_i)}\widehat{\mat{\Delta}}^{-1} \mat{r}_i + \t{\mat{r}_i}(\d\widehat{\mat{\Delta}}^{-1}) \mat{r}_i + \t{\mat{r}_i}\widehat{\mat{\Delta}}^{-1} \d \mat{r}_i\big) \nonumber\\ + &= \underbrace{-\frac{n}{2}\d\log|\widehat{\mat{\Delta}}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}(\d\widehat{\mat{\Delta}}^{-1}) \mat{r}_i}_{=\,0\text{ due to }\widehat{\mat{\Delta}}\text{ being the MLE}} \label{eq:deriv1} - \sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat{\Delta}}^{-1} \d \mat{r}_i. +\end{align} +The next step is to compute $\d \mat{r}_i$, which depends on both $\mat\alpha$ and $\mat\beta$ +\begin{align*} + \d\mat{r}_i(\mat\alpha, \mat\beta) + &= -\d(\mat\alpha\kron \mat\beta)\vec\mat{f}_{y_i} \\ + &= -\vec\!\big( \mat{I}_{p q}\,\d(\mat\alpha\kron \mat\beta)\vec\mat{f}_{y_i} \big) \\ + &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})\,\d\vec(\mat\alpha\kron \mat\beta) \\ +\intertext{using the identity \eqref{eq:vecKron}, to obtain vectorized differentials, gives} + \dots + &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \,\d(\vec \mat\alpha\kron \vec \mat\beta) \\ + &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big((\d\vec \mat\alpha)\kron \vec \mat\beta + \vec \mat\alpha\kron (\d\vec \mat\beta)\big) \\ + &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big(\mat{I}_{r q}(\d\vec \mat\alpha)\kron (\vec \mat\beta)\mat{I}_1 + (\vec \mat\alpha)\mat{I}_1\kron \mat{I}_{k p}(\d\vec \mat\beta)\big) \\ + &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big((\mat{I}_{r q}\kron\vec \mat\beta)\d\vec \mat\alpha + (\vec \mat\alpha\kron \mat{I}_{k p})\d\vec \mat\beta\big). +\end{align*} +Now, substitution of $\d\mat{r}_i$ into \eqref{eq:deriv1} gives the gradients (not dimension standardized versions of $\D\hat{l}(\mat\alpha)$, $\D\hat{l}(\mat\beta)$) by identification of the derivatives from the differentials (see: \todo{appendix}) +\begin{align*} + \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) &= + \sum_{i = 1}^n (\t{\vec(\mat{f}_{y_i})}\kron\t{\mat{r}_i}\widehat{\mat\Delta}^{-1}) (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) (\mat{I}_{r q}\kron\vec \mat\beta), + {\color{gray}\qquad(q\times r)} \\ + \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta) &= + \sum_{i = 1}^n (\t{\vec(\mat{f}_{y_i})}\kron\t{\mat{r}_i}\widehat{\mat\Delta}^{-1}) (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) (\vec \mat\alpha\kron \mat{I}_{k p}). + {\color{gray}\qquad(p\times k)} +\end{align*} +These quantities are very verbose and not directly usable for an implementation. A detailed analysis of the gradients shows that the main parts are only element permutations with high sparsity. By defining the following compact matrix +\begin{equation}\label{eq:permTransResponse} + \mat G = \vec^{-1}_{q r}\bigg(\Big( \sum_{i = 1}^n \vec\mat{f}_{y_i}\otimes \widehat{\mat\Delta}^{-1}\mat{r}_i \Big)_{\pi(i)}\bigg)_{i = 1}^{p q k r}{\color{gray}\qquad(q r \times p k)} +\end{equation} +with $\pi$ being a permutation of $p q k r$ elements corresponding to permuting the axes of a 4D tensor of dimensions $p\times q\times k\times r$ by $(2, 4, 1, 3)$.
As a generalization of transposition, this leads to a rearrangement of the elements corresponding to the permuted 4D tensor with dimensions $q\times r\times p\times k$ which is then vectorized and reshaped into a matrix of dimensions $q r \times p k$. With $\mat G$ the gradients simplify to \todo{validate this mathematically} +\begin{align*} + \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) &= + \vec_{q}^{-1}(\mat{G}\vec{\mat\beta}), + {\color{gray}\qquad(q\times r)} \\ + \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta) &= + \vec_{p}^{-1}(\t{\mat{G}}\vec{\mat\alpha}). + {\color{gray}\qquad(p\times k)} +\end{align*} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% Kronecker Covariance Structure %%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Kronecker Covariance Structure} +Now we assume the residuals covariance has the form $\mat\Delta = \mat\Delta_1\otimes\mat\Delta_2$ where $\mat\Delta_1$, $\mat\Delta_2$ are $q\times q$, $p\times p$ covariance matrices, respectively. This is analogous to the case that the $\mat{R}_i$'s are i.i.d. matrix normal distributed +\begin{displaymath} + \mat{R}_i = \mat{X}_i - \mat\mu - \mat\beta\mat{f}_{y_i}\t{\mat\alpha} \sim \mathcal{MN}_{p\times q}(\mat 0, \mat\Delta_2, \mat\Delta_1). +\end{displaymath} +The density of the matrix normal (with mean zero) is equivalent to the density of the vectorized quantities, which are multivariate normally distributed with Kronecker structured covariance +\begin{align*} + f(\mat R) + &= \frac{1}{\sqrt{(2\pi)^{p q}|\mat\Delta|}}\exp\left(-\frac{1}{2}\t{\vec(\mat{R})} \mat\Delta^{-1}\vec(\mat{R})\right) \\ + &= \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{\mat{R}}\mat\Delta_2^{-1}\mat{R})\right) +\end{align*} +which, for given data, leads to the log-likelihood +\begin{displaymath} + l(\mat{\mu}, \mat\Delta_1, \mat\Delta_2) = + -\frac{n p q}{2}\log 2\pi + -\frac{n p}{2}\log|\mat{\Delta}_1| + -\frac{n q}{2}\log|\mat{\Delta}_2| + -\frac{1}{2}\sum_{i = 1}^n \tr(\mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i). +\end{displaymath} +\subsection{MLE covariance estimates} +Our first order of business is to derive the MLE estimates of the covariance matrices $\mat\Delta_1$, $\mat\Delta_2$ (the mean estimate $\widehat{\mat\mu}$ is trivial). Therefore, we look at the differentials with respect to changes in the covariance matrices as +\begin{align*} + \d l(\mat\Delta_1, \mat\Delta_2) &= + -\frac{n p}{2}\d\log|\mat{\Delta}_1| + -\frac{n q}{2}\d\log|\mat{\Delta}_2| + -\frac{1}{2}\sum_{i = 1}^n + \tr( (\d\mat\Delta_1^{-1})\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i + + \mat\Delta_1^{-1}\t{\mat{R}_i}(\d\mat\Delta_2^{-1})\mat{R}_i) \\ + &= + -\frac{n p}{2}\tr\mat{\Delta}_1^{-1}\d\mat{\Delta}_1 + -\frac{n q}{2}\tr\mat{\Delta}_2^{-1}\d\mat{\Delta}_2 \\ + &\qquad\qquad + +\frac{1}{2}\sum_{i = 1}^n + \tr( \mat\Delta_1^{-1}(\d\mat\Delta_1)\mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i + + \mat\Delta_1^{-1}\t{\mat{R}_i}\mat\Delta_2^{-1}(\d\mat\Delta_2)\mat\Delta_2^{-1}\mat{R}_i) \\ + &= \frac{1}{2}\tr\!\Big(\Big( + -n p \mat{I}_q + \mat\Delta_1^{-1}\sum_{i = 1}^n \t{\mat{R}_i}\mat\Delta_2^{-1}\mat{R}_i + \Big)\mat{\Delta}_1^{-1}\d\mat{\Delta}_1\Big) \\ + &\qquad\qquad + + \frac{1}{2}\tr\!\Big(\Big( + -n q \mat{I}_p + \mat\Delta_2^{-1}\sum_{i = 1}^n \mat{R}_i\mat\Delta_1^{-1}\t{\mat{R}_i} + \Big)\mat{\Delta}_2^{-1}\d\mat{\Delta}_2\Big) \overset{!}{=} 0.
+\end{align*} +Setting $\d l$ to zero yields the MLE estimates as +\begin{displaymath} + \widehat{\mat{\mu}} = \overline{\mat X}{\color{gray}\quad(p\times q)}, \qquad + \widehat{\mat\Delta}_1 = \frac{1}{n p}\sum_{i = 1}^n \t{\mat{R}_i}\widehat{\mat\Delta}_2^{-1}\mat{R}_i{\color{gray}\quad(q\times q)}, \qquad + \widehat{\mat\Delta}_2 = \frac{1}{n q}\sum_{i = 1}^n \mat{R}_i\widehat{\mat\Delta}_1^{-1}\t{\mat{R}_i}{\color{gray}\quad(p\times p)}. +\end{displaymath} +Next, analogous to above, we take the estimated log-likelihood and derive gradients with respect to $\mat{\alpha}$, $\mat{\beta}$. +The estimated log-likelihood is derived by replacing the unknown covariance matrices by their MLE estimates, leading to +\begin{displaymath} + \hat{l}(\mat\alpha, \mat\beta) = + -\frac{n p q}{2}\log 2\pi + -\frac{n p}{2}\log|\widehat{\mat{\Delta}}_1| + -\frac{n q}{2}\log|\widehat{\mat{\Delta}}_2| + -\frac{1}{2}\sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i) +\end{displaymath} +and its differential +\begin{displaymath} + \d\hat{l}(\mat\alpha, \mat\beta) = + -\frac{n p}{2}\d\log|\widehat{\mat{\Delta}}_1| + -\frac{n q}{2}\d\log|\widehat{\mat{\Delta}}_2| + -\frac{1}{2}\sum_{i = 1}^n \d\tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i). +\end{displaymath} +We first take a closer look at the sum. After a bit of algebra using $\d\mat A^{-1} = -\mat A^{-1}(\d\mat A)\mat A^{-1}$ and the definitions of $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$ the sum can be rewritten as +\begin{displaymath} + \frac{1}{2}\sum_{i = 1}^n \d\tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i) + = \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\d\mat{R}_i) + - \frac{np}{2}\d\log|\widehat{\mat\Delta}_1| + - \frac{nq}{2}\d\log|\widehat{\mat\Delta}_2|.
+\end{displaymath} +This means that most of the derivative cancels out and we get +\begin{align*} + \d\hat{l}(\mat\alpha, \mat\beta) + &= -\sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\d\mat{R}_i) \\ + &= \sum_{i = 1}^n \tr(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}((\d\mat\beta)\mat{f}_{y_i}\t{\mat\alpha} + \mat\beta\mat{f}_{y_i}\t{(\d\mat\alpha)})) \\ + &= \sum_{i = 1}^n \t{\vec(\widehat{\mat{\Delta}}_2^{-1}\mat{R}_i\widehat{\mat{\Delta}}_1^{-1}\mat\alpha\t{\mat{f}_{y_i}})}\d\vec\mat\beta + + \sum_{i = 1}^n \t{\vec(\widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat\beta\mat{f}_{y_i})}\d\vec\mat\alpha +\end{align*} +which means the gradients are +\begin{align*} + \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) + &= \sum_{i = 1}^n \widehat{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widehat{\mat{\Delta}}_2^{-1}\mat\beta\mat{f}_{y_i} + = (\ten{R}\ttm[3]\widehat{\mat{\Delta}}_1^{-1}\ttm[2]\widehat{\mat{\Delta}}_2^{-1})_{(3)}\t{(\ten{F}\ttm[2]\mat\beta)_{(3)}}\\ + \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta) + &= \sum_{i = 1}^n \widehat{\mat{\Delta}}_2^{-1}\mat{R}_i\widehat{\mat{\Delta}}_1^{-1}\mat\alpha\t{\mat{f}_{y_i}} + = (\ten{R}\ttm[3]\widehat{\mat{\Delta}}_1^{-1}\ttm[2]\widehat{\mat{\Delta}}_2^{-1})_{(2)}\t{(\ten{F}\ttm[3]\mat\alpha)_{(2)}} +\end{align*} + +\paragraph{Comparison to the general case:} There are two main differences. First, the general case has a closed form solution for the gradient due to the explicit nature of the MLE estimate $\widehat{\mat\Delta}$, compared to the mutually dependent MLE estimates $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$. On the other hand, the general case has dramatically bigger dimensions of the covariance matrix ($p q \times p q$) compared to the two Kronecker components with dimensions $q \times q$ and $p \times p$. This means that in the general case there is a huge performance penalty in the dimensions of $\widehat{\mat\Delta}$ while in the other case an extra estimation is required to determine $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% Alternative covariance estimates %%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Alternative covariance estimates} +An alternative approach is \emph{not} to use the MLE estimates for $\mat\Delta_1$, $\mat\Delta_2$ but (up to scaling) unbiased estimates. +\begin{displaymath} + \widetilde{\mat\Delta}_1 = \frac{1}{n}\sum_{i = 1}^n \t{\mat{R}_i}\mat{R}_i {\color{gray}\quad(q\times q)},\qquad + \widetilde{\mat\Delta}_2 = \frac{1}{n}\sum_{i = 1}^n \mat{R}_i\t{\mat{R}_i} {\color{gray}\quad(p\times p)}. +\end{displaymath} +The unbiasedness (up to scaling) follows directly from the following short computation +\begin{displaymath} + (\E\widetilde{\mat\Delta}_1)_{j,k} = \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p \E \mat{R}_{i,l,j}\mat{R}_{i,l,k} + = \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^p (\mat{\Delta}_{2})_{l,l}(\mat{\Delta}_{1})_{j,k} + = (\mat\Delta_1\tr(\mat\Delta_2))_{j,k} +\end{displaymath} +which means that $\E\widetilde{\mat\Delta}_1 = \mat\Delta_1\tr(\mat\Delta_2)$ and in analogy $\E\widetilde{\mat\Delta}_2 = \mat\Delta_2\tr(\mat\Delta_1)$.
Now, we need to handle the scaling, which can be estimated unbiasedly by +\begin{displaymath} + \tilde{s} = \frac{1}{n}\sum_{i = 1}^n \|\mat{R}_i\|_F^2 +\end{displaymath} +because with $\|\mat{R}_i\|_F^2 = \tr \mat{R}_i\t{\mat{R}_i} = \tr \t{\mat{R}_i}\mat{R}_i$ the scale estimate satisfies $\tilde{s} = \tr(\widetilde{\mat\Delta}_1) = \tr(\widetilde{\mat\Delta}_2)$. Then $\E\tilde{s} = \tr(\E\widetilde{\mat\Delta}_1) = \tr{\mat\Delta}_1 \tr{\mat\Delta}_2 = \tr({\mat\Delta}_1\otimes{\mat\Delta}_2)$. This leads to the covariance estimate +\begin{displaymath} + \widetilde{\mat\Delta} = \tilde{s}^{-1}(\widetilde{\mat{\Delta}}_1\otimes\widetilde{\mat{\Delta}}_2). +\end{displaymath} + +\todo{ prove they are consistent, especially $\widetilde{\mat\Delta} = \tilde{s}^{-1}(\widetilde{\mat\Delta}_1\otimes\widetilde{\mat\Delta}_2)$!} + +The hoped-for benefit is that these covariance estimates are available in closed form, which means there is no need for an additional iterative estimation step. Before we start with the derivation of the gradients, we define the following two quantities +\begin{displaymath} + \mat{S}_1 = \frac{1}{n}\sum_{i = 1}^n \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\quad{\color{gray}(q\times q)}, \qquad + \mat{S}_2 = \frac{1}{n}\sum_{i = 1}^n \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\quad{\color{gray}(p\times p)}. +\end{displaymath} + +Now, the matrix normal density with the covariance matrix of the vectorized quantities of the form $\mat{\Delta} = s^{-1}(\mat{\Delta}_1\otimes\mat{\Delta}_2)$ is +\begin{align*} + f(\mat R) + &= \frac{1}{\sqrt{(2\pi)^{p q}|\mat\Delta|}}\exp\left(-\frac{1}{2}\t{\vec(\mat{R})} \mat\Delta^{-1}\vec(\mat{R})\right) \\ + &= \frac{s^{p q / 2}}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{s}{2}\tr(\mat\Delta_1^{-1}\t{\mat{R}}\mat\Delta_2^{-1}\mat{R})\right). +\end{align*} + +The approximated log-likelihood is then +\begin{align*} + \tilde{l}(\mat\alpha, \mat\beta) + &= + -\frac{n p q}{2}\log{2\pi} + -\frac{n}{2}\log|\widetilde{\mat{\Delta}}| + -\frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widetilde{\mat{\Delta}}^{-1}\mat{r}_i \\ + &= + -\frac{n p q}{2}\log{2\pi} + +\frac{n p q}{2}\log\tilde{s} + -\frac{n p}{2}\log|\widetilde{\mat{\Delta}}_1| + -\frac{n q}{2}\log|\widetilde{\mat{\Delta}}_2| + -\frac{\tilde{s}}{2}\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i). +\end{align*} +The second form is due to the property of the determinant for scaling and the Kronecker product giving that $|\widetilde{\mat\Delta}| = (\tilde{s}^{-1})^{p q}|\widetilde{\mat{\Delta}}_1|^p |\widetilde{\mat{\Delta}}_2|^q$ as well as an analogous Kronecker decomposition of the quadratic form as in the MLE case. + +Note that the following holds +\begin{displaymath} + \sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i) + = n \tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) + = n \tr(\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2) + = n \tr(\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}) + = n \tr(\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1}). +\end{displaymath} + +The derivation of the gradient of the approximated log-likelihood $\tilde{l}$ is tedious but straightforward.
We tackle the summands separately: +\begin{align*} + \d\log\tilde{s} &= \tilde{s}^{-1}\d\tilde{s} = \frac{2}{n\tilde{s}}\sum_{i = 1}^n \tr(\t{\mat{R}_i}\d\mat{R}_i) + = -\frac{2}{n\tilde{s}}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}), \\ + \d\log|\widetilde{\mat{\Delta}}_1| &=\tr(\widetilde{\mat{\Delta}}_1^{-1}\d\widetilde{\mat{\Delta}}_1) = \frac{2}{n}\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\d\mat{R}_i) + = -\frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\d\mat{\beta}), \\ + \d\log|\widetilde{\mat{\Delta}}_2| &=\tr(\widetilde{\mat{\Delta}}_2^{-1}\d\widetilde{\mat{\Delta}}_2) = \frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\d\mat{R}_i) + = -\frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\d\mat{\beta}) +\end{align*} +as well as +\begin{displaymath} + \d\,\Big(\tilde{s}\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i)\Big) + = (\d\tilde{s})\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i) + + \tilde{s}\, \d \sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i). +\end{displaymath} +We have +\begin{displaymath} + \d\tilde{s} = -\frac{2}{n}\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}) +\end{displaymath} +and the remaining term +\begin{align*} + \d\sum_{i = 1}^n\tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i) + = 2\sum_{i = 1}^n \tr(&\t{\mat{f}_{y_i}}\t{\mat{\beta }}(\mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1} + \widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i - \widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i \widetilde{\mat{\Delta}}_1^{-1})\d\mat{\alpha} \\ + +\,&\mat{f}_{y_i} \t{\mat{\alpha}}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1})\d\mat{\beta }). +\end{align*} +The last one is tedious but straightforward. Its computation extensively uses the symmetry of $\widetilde{\mat{\Delta}}_1$, $\widetilde{\mat{\Delta}}_2$, the cyclic property of the trace and the relation $\d\mat{A}^{-1} = -\mat{A}^{-1}(\d\mat{A})\mat{A}^{-1}$.
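+As one illustrative intermediate step (using only the relations above), the contribution of $\d\widetilde{\mat{\Delta}}_1^{-1}$ reduces, with $\d\widetilde{\mat{\Delta}}_1 = \frac{1}{n}\sum_{j = 1}^n (\t{(\d\mat{R}_j)}\mat{R}_j + \t{\mat{R}_j}\d\mat{R}_j)$ and the definition of $\mat{S}_1$, to
+\begin{displaymath}
+    \sum_{i = 1}^n \tr\big((\d\widetilde{\mat{\Delta}}_1^{-1})\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\big)
+    = -n\tr\big(\widetilde{\mat{\Delta}}_1^{-1}(\d\widetilde{\mat{\Delta}}_1)\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\big)
+    = -2\sum_{j = 1}^n \tr\big(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_j}\d\mat{R}_j\big)
+\end{displaymath}
+and substituting $\d\mat{R}_j = -(\d\mat{\beta})\mat{f}_{y_j}\t{\mat{\alpha}} - \mat{\beta}\mat{f}_{y_j}\t{(\d\mat{\alpha})}$ produces the $\mat{S}_1$ summands of the bracketed expressions above; the $\d\widetilde{\mat{\Delta}}_2^{-1}$ contribution is handled analogously.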
+ +Putting it all together +\begin{align*} + \d\tilde{l}(\mat{\alpha}, \mat{\beta}) + &= \frac{n p q}{2}\Big(-\frac{2}{n\tilde{s}}\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}) \\ + &\hspace{3em} - \frac{n p}{2}\Big(-\frac{2}{n}\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\d\mat{\beta}) \\ + &\hspace{3em} - \frac{n q}{2}\Big(-\frac{2}{n}\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\d\mat{\beta}) \\ + &\hspace{3em} -\frac{1}{2}\Big(-\frac{2}{n}\Big)\Big(\sum_{i = 1}^n \tr(\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i)\Big)\sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{R}_i\d\mat{\alpha} + \mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{R}_i}\d\mat{\beta}) \\ + &\hspace{3em} -\frac{\tilde{s}}{2}2\sum_{i = 1}^n \tr\!\Big(\t{\mat{f}_{y_i}}\t{\mat{\beta }}(\mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1} + \widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i - \widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i \widetilde{\mat{\Delta}}_1^{-1})\d\mat{\alpha} \\ + &\hspace{3em} \hspace{4.7em} + \mat{f}_{y_i} \t{\mat{\alpha}}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1})\d\mat{\beta }\Big) \\ +% + &= \sum_{i = 1}^n \tr\bigg(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\Big( + -p q \tilde{s}^{-1} \mat{R}_i + p \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1} + q \widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i + \tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)\mat{R}_i \\ + &\hspace{3em} \hspace{4.7em} - \tilde{s}(\mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1} + \widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i - \widetilde{\mat{\Delta}}_2^{-1} \mat{R}_i \widetilde{\mat{\Delta}}_1^{-1}) + \Big)\d\mat{\alpha}\bigg) \\ + &\hspace{3em}+ \sum_{i = 1}^n \tr\bigg(\mat{f}_{y_i}\t{\mat{\alpha}}\Big( + -p q \tilde{s}^{-1} \t{\mat{R}_i} + p \widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + q \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1} + \tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)\t{\mat{R}_i} \\ + &\hspace{3em}\hspace{3em} \hspace{4.7em} - \tilde{s}(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}\t{\mat{R}_i} + \t{\mat{R}_i}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2\widetilde{\mat{\Delta}}_2^{-1} - \widetilde{\mat{\Delta}}_1^{-1} \t{\mat{R}_i} \widetilde{\mat{\Delta}}_2^{-1}) + \Big)\d\mat{\beta}\bigg). +\end{align*} +Observe that the bracketed expressions before $\d\mat{\alpha}$ and $\d\mat{\beta}$ are transposes of each other.
Let us denote the expression for $\d\mat{\alpha}$ as $\mat{G}_i$ which has the form +\begin{displaymath} + \mat{G}_i + = (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\mat{R}_i + + (q\mat{I}_p - \tilde{s}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2)\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i + + \mat{R}_i\widetilde{\mat{\Delta}}_1^{-1}(p\mat{I}_q - \tilde{s}\mat{S}_1\widetilde{\mat{\Delta}}_1^{-1}) + + \tilde{s}\widetilde{\mat{\Delta}}_2^{-1}\mat{R}_i\widetilde{\mat{\Delta}}_1^{-1} +\end{displaymath} +and with $\ten{G}$ the order 3 tensor stacking the $\mat{G}_i$'s such that the first mode indexes the observations +\begin{displaymath} + \ten{G} + = (\tr(\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1) - p q \tilde{s}^{-1})\ten{R} + + \ten{R}\ttm[2](q\mat{I}_p - \tilde{s}\widetilde{\mat{\Delta}}_2^{-1}\mat{S}_2)\widetilde{\mat{\Delta}}_2^{-1} + + \ten{R}\ttm[3](p\mat{I}_q - \tilde{s}\widetilde{\mat{\Delta}}_1^{-1}\mat{S}_1)\widetilde{\mat{\Delta}}_1^{-1} + + \tilde{s}\ten{R}\ttm[2]\widetilde{\mat{\Delta}}_2^{-1}\ttm[3]\widetilde{\mat{\Delta}}_1^{-1}. +\end{displaymath} +This leads to the differential of $\tilde{l}$ given by +\begin{displaymath} + \d\tilde{l}(\mat{\alpha}, \mat{\beta}) + = \sum_{i = 1}^n \tr(\t{\mat{f}_{y_i}}\t{\mat{\beta}}\mat{G}_i\d\mat{\alpha}) + + \sum_{i = 1}^n \tr(\mat{f}_{y_i}\t{\mat{\alpha}}\t{\mat{G}_i}\d\mat{\beta}) +\end{displaymath} +and therefore the gradients +\begin{align*} + \nabla_{\mat{\alpha}}\tilde{l}(\mat{\alpha}, \mat{\beta}) &= \sum_{i = 1}^n \t{\mat{G}_i}\mat{\beta}\mat{f}_{y_i} + = \ten{G}_{(3)}\t{(\ten{F}\ttm[2]\mat{\beta})_{(3)}}, \\ + \nabla_{\mat{\beta}} \tilde{l}(\mat{\alpha}, \mat{\beta}) &= \sum_{i = 1}^n \mat{G}_i\mat{\alpha}\t{\mat{f}_{y_i}} + = \ten{G}_{(2)}\t{(\ten{F}\ttm[3]\mat{\alpha})_{(2)}}. +\end{align*} + +\todo{check the tensor version of the gradient!!!} + +\newpage + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% Numerical Examples %%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Numerical Examples} +The first example (which by itself is \emph{not} exemplary) is the estimation with parameters $n = 200$, $p = 11$, $q = 5$, $k = 14$ and $r = 9$. The ``true'' matrices $\mat\alpha$, $\mat\beta$ are generated by sampling their elements i.i.d. standard normal, as are the responses $y$. Then, for each observation, the elements of $\mat{f}_y$ are computed as $\sin(s_{i, j} y + o_{i j})$ \todo{ properly describe}. Then the $\mat{X}$'s are sampled as +\begin{displaymath} + \mat{X} = \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}, \qquad \vec{\mat{\epsilon}} \sim \mathcal{N}_{p q}(\mat{0}, \mat{\Delta}) +\end{displaymath} +where $\mat{\Delta}_{i j} = 0.5^{|i - j|}$ for $i, j = 1, ..., p q$.
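+The following short R sketch illustrates this data generating process (it is only a sketch: the concrete choice of the frequencies $s_{i,j}$ and phase shifts $o_{i,j}$ below is a placeholder, as their specification is still marked as a TODO above).
+\begin{verbatim}
+set.seed(1)
+n <- 200; p <- 11; q <- 5; k <- 14; r <- 9
+alpha <- matrix(rnorm(q * r), q, r)   # true alpha (q x r)
+beta  <- matrix(rnorm(p * k), p, k)   # true beta  (p x k)
+y <- rnorm(n)                         # responses
+# placeholder choice of s_{i,j} and o_{i,j} for f_y = sin(s * y + o)
+s <- matrix(seq_len(k * r), k, r)
+o <- matrix(runif(k * r, 0, 2 * pi), k, r)
+# Delta_{ij} = 0.5^|i - j|, lower Cholesky factor for sampling vec(epsilon)
+Delta <- 0.5^abs(outer(seq_len(p * q), seq_len(p * q), `-`))
+L <- t(chol(Delta))
+X  <- array(dim = c(n, p, q))         # tensor of predictors
+Fy <- array(dim = c(n, k, r))         # tensor of the f_y's
+for (i in seq_len(n)) {
+    Fy[i, , ] <- sin(s * y[i] + o)
+    eps <- matrix(L %*% rnorm(p * q), p, q)
+    X[i, , ] <- beta %*% Fy[i, , ] %*% t(alpha) + eps
+}
+\end{verbatim}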
+ +\begin{figure} + \centering + \includegraphics{loss_Ex01.png} +\end{figure} +\begin{figure} + \centering + \includegraphics{estimates_Ex01.png} +\end{figure} +\begin{figure} + \centering + \includegraphics{Delta_Ex01.png} +\end{figure} +\begin{figure} + \centering + \includegraphics{hist_Ex01.png} +\end{figure} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% Bib and Index %%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\printindex +\nocite{*} +\printbibliography + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% Appendix %%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\appendix +\section{Matrix Differential Rules} +Let $\mat A$ be a square matrix (invertible where needed) and let $|\cdot|$ denote the determinant. Then +\begin{align*} + \d\log|\mat A| &= \frac{1}{|\mat A|}\d|\mat{A}| \\ + \d|\mat A| &= |\mat A|\tr \mat{A}^{-1}\d\mat A \\ + \d\log|\mat A| &= \tr\mat{A}^{-1}\d\mat A \\ + \d\mat{A}^{-1} &= -\mat{A}^{-1}(\d\mat{A})\mat{A}^{-1} +\end{align*} + +\section{Useful Matrix Identities} +In this section we summarize a few useful matrix identities; for more details see, for example, \cite{MatrixAlgebra-AbadirMagnus2005}. + +For two matrices $\mat A$ of dimensions $q\times r$ and $\mat B$ of dimensions $p\times k$ it holds that +\begin{equation}\label{eq:vecKron} + \vec(\mat A\kron\mat B) = (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p)(\vec\mat A\kron\vec\mat B). +\end{equation} + +Let $\mat A$ be a $p\times p$ dimensional non-singular matrix. Furthermore, let $\mat a, \mat b$ be $p$-vectors such that $\t{\mat b}\mat{A}^{-1}\mat a\neq -1$, then +\begin{displaymath} + (\mat A + \mat a\t{\mat b})^{-1} = \mat{A}^{-1} - \frac{1}{1 + \t{\mat b}\mat{A}^{-1}\mat a}\mat{A}^{-1}\mat{a}\t{\mat{b}}\mat{A}^{-1} +\end{displaymath} +as well as +\begin{displaymath} + \det(\mat A + \mat a\t{\mat b}) = \det(\mat A)(1 + \t{\mat b}{\mat A}^{-1}\mat a) +\end{displaymath} +which even holds in the case $\t{\mat b}\mat{A}^{-1}\mat a = -1$. This is known as Sylvester's determinant theorem. + + +\section{Commutation Matrix and Permutation Identities} +\begin{center} + Note: In this section we use 0-indexing for the sake of simplicity! +\end{center} +In this section we summarize relations between the commutation matrix and the corresponding permutation. We also list some extensions to ``simplify'' or represent some terms. This is mostly intended for implementation purposes and understanding of terms occurring in the computations. + +Let $\mat A$ be an arbitrary $p\times q$ matrix. The commutation matrix $\mat K_{p, q}$ satisfies +\begin{displaymath} + \mat{K}_{p, q}\vec{\mat{A}} = \vec{\t{\mat{A}}} \quad\Leftrightarrow\quad (\vec{\t{\mat{A}}})_{\pi_{p, q}(i)} = (\vec{\mat{A}})_{i}, \quad\text{for } i = 0, ..., p q - 1 +\end{displaymath} +where $\pi_{p, q}$ is a permutation of the indices $i = 0, ..., p q - 1$ such that +\begin{displaymath} + \pi_{p, q}(i + j p) = j + i q, \quad\text{for }i = 0, ..., p - 1; j = 0, ..., q - 1.
+\end{displaymath} + +\begin{table}[!htp] + \centering + \begin{minipage}{0.8\textwidth} + \centering + \begin{tabular}{l c l} + $\mat{K}_{p, q}$ & $\hat{=}$ & $\pi_{p, q}(i + j p) = j + i q$ \\ + $\mat{I}_r\kron\mat{K}_{p, q}$ & $\hat{=}$ & $\tilde{\pi}_{p, q, r}(i + j p + k p q) = j + i q + k p q$ \\ + $\mat{K}_{p, q}\kron\mat{I}_r$ & $\hat{=}$ & $\hat{\pi}_{p, q, r}(i + j p + k p q) = r(j + i q) + k$ + \end{tabular} + \caption{\label{tab:commutation-permutation}Commutation matrix terms and corresponding permutations. Indices are all 0-indexed with the ranges $i = 0, ..., p - 1$, $j = 0, ..., q - 1$ and $k = 0, ..., r - 1$.} + \end{minipage} +\end{table} + + + +\section{Matrix and Tensor Operations} + +The \emph{Kronecker product}\index{Operations!Kronecker@$\kron$ Kronecker product} is denoted as $\kron$ and the \emph{Hadamard product} uses the symbol $\circ$. We also need the \emph{Khatri-Rao product}\index{Operations!KhatriRao@$\hada$ Khatri-Rao product} +$\hada$ as well as the \emph{Transposed Khatri-Rao product} $\odot_t$ (or \emph{Face-Splitting product}). There is also the \emph{$n$-mode Tensor Matrix Product}\index{Operations!ttm@$\ttm[n]$ $n$-mode tensor product} denoted by $\ttm[n]$ in conjunction with the \emph{$n$-mode Matricization} of a tensor $\ten{T}$ written as $\ten{T}_{(n)}$, which is a matrix. See below for definitions and examples of these operations.\todo{ Definitions and Examples} + +\todo{ resolve confusion between Khatri-Rao, Column-wise Kronecker / Khatri-Rao, Row-wise Kronecker / Khatri-Rao, Face-Splitting Product, .... Yes, it's a mess.} +\paragraph{Kronecker Product $\kron$:} +\paragraph{Khatri-Rao Product $\hada$:} +\paragraph{Transposed Khatri-Rao Product $\odot_t$:} This is also known as the Face-Splitting Product and is the row-wise Kronecker product of two matrices. It relates to the column-wise Kronecker product through +\begin{displaymath} + \t{(\mat{A}\odot_{t}\mat{B})} = \t{\mat{A}}\hada\t{\mat{B}} +\end{displaymath} + +\paragraph{$n$-mode unfolding:} \emph{Unfolding}, also known as \emph{flattening} or \emph{matricization}, is a reshaping of a tensor into a matrix with a rearrangement of the elements such that mode $n$ corresponds to the rows of the resulting matrix and all other modes are vectorized into the columns. Let $\ten{T}$ be a tensor of order $m$ with dimensions $t_1\times ... \times t_n\times ... \times t_m$ and elements indexed by $(i_1, ..., i_n, ..., i_m)$. The $n$-mode flattening, denoted $\ten{T}_{(n)}$, is defined as a $(t_n, \prod_{k\neq n}t_k)$ matrix with element indices $(i_n, j)$ such that $j = \sum_{k = 1, k\neq n}^m i_k\prod_{l = 1, l\neq n}^{k - 1}t_l$. +\todo{ give an example!} + +\paragraph{$n$-mode Tensor Product $\ttm[n]$:} +The \emph{$n$-mode tensor product} $\ttm[n]$ between a tensor $\ten{T}$ of order $m$ with dimensions $t_1\times t_2\times ... \times t_n\times ... \times t_m$ and a $p\times t_n$ matrix $\mat{M}$ is defined element-wise as +\begin{displaymath} + (\ten{T}\ttm[n] \mat{M})_{i_1, ..., i_{n-1}, j, i_{n+1}, ..., i_m} = \sum_{k = 1}^{t_n} \ten{T}_{i_1, ..., i_{n-1}, k, i_{n+1}, ..., i_m} \mat{M}_{j, k} +\end{displaymath} +where $i_1, ..., i_{n-1}, i_{n+1}, ..., i_m$ run from $1$ to $t_1, ..., t_{n-1}, t_{n+1}, ..., t_m$, respectively. Furthermore, the $n$-th fiber index $j$ of the product ranges from $1$ to $p$. This gives a new tensor $\ten{T}\ttm[n]\mat{M}$ of order $m$ with dimensions $t_1\times t_2\times ... \times p\times ... \times t_m$.
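+As a minimal illustration in R (the function names and the use of base \texttt{aperm}/\texttt{matrix} are implementation choices, not part of the text; the element ordering follows the definitions above), the $n$-mode unfolding and the $n$-mode product can be sketched as
+\begin{verbatim}
+# n-mode unfolding: mode n indexes the rows, all remaining modes
+# (in their original order) are vectorized into the columns
+unfold <- function(A, n) {
+    d <- dim(A)
+    matrix(aperm(A, c(n, seq_along(d)[-n])), nrow = d[n])
+}
+# n-mode product: unfold, multiply from the left, fold back
+ttm <- function(A, M, n) {
+    d <- dim(A)
+    stopifnot(ncol(M) == d[n])
+    perm <- c(n, seq_along(d)[-n])
+    C <- M %*% unfold(A, n)
+    d[n] <- nrow(M)
+    aperm(array(C, dim = d[perm]), order(perm))
+}
+# sanity check against the matrix multiplication analogs below
+A <- array(rnorm(24), dim = c(2, 3, 4))
+B <- matrix(rnorm(6), 2, 3)
+stopifnot(all.equal(ttm(A, B, 2)[1, , ], B %*% A[1, , ]))
+\end{verbatim}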
+ +\begin{example}[Matrix Multiplication Analogs] + Let $\mat{A}$, $\mat{B}$ be two matrices with dimensions $t_1\times t_2$ and $p\times q$, respectively. Then $\mat{A}$ is also a tensor of order $2$, and the $1$-mode and $2$-mode products are element-wise given by + \begin{align*} + (\mat{A}\ttm[1] \mat{B})_{i,j} &= \sum_{l = 1}^{t_1} \mat{A}_{l,j}\mat{B}_{i,l} + = (\mat{B}\mat{A})_{i,j} + & \text{for }t_1 = q, \\ + (\mat{A}\ttm[2] \mat{B})_{i,j} &= \sum_{l = 1}^{t_2} \mat{A}_{i,l}\mat{B}_{j,l} + = (\mat{A}\t{\mat{B}})_{i,j} = \t{(\mat{B}\t{\mat{A}})}_{i,j} + & \text{for }t_2 = q. + \end{align*} + In other words, the $1$-mode product equals $\mat{A}\ttm[1] \mat{B} = \mat{B}\mat{A}$ and the $2$-mode is $\mat{A}\ttm[2] \mat{B} = \t{(\mat{B}\t{\mat{A}})}$ in the case of the tensor $\mat{A}$ being a matrix. +\end{example} + +\begin{example}[Order Three Analogs] + Let $\mat{A}$ be a tensor of the form $t_1\times t_2\times t_3$ and $\mat{B}$ a matrix of dimensions $p\times q$, then the $n$-mode products take the following form + \begin{align*} + (\mat{A}\ttm[1]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_1} \mat{A}_{l,j,k}\mat{B}_{i,l} & \text{for }t_1 = q, \\ + (\mat{A}\ttm[2]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_2} \mat{A}_{i,l,k}\mat{B}_{j,l} \equiv (\mat{B}\mat{A}_{i,:,:})_{j,k} & \text{for }t_2 = q, \\ + (\mat{A}\ttm[3]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_3} \mat{A}_{i,j,l}\mat{B}_{k,l} \equiv \t{(\mat{B}\t{\mat{A}_{i,:,:}})}_{j,k} & \text{for }t_3 = q. + \end{align*} +\end{example} + +Letting $\ten{F}$ be the $3$-tensor of dimensions $n\times k\times r$ such that $\ten{F}_{i,:,:} = \mat{f}_{y_i}$, then +\begin{displaymath} + \mat{\beta}\mat{f}_{y_i}\t{\mat{\alpha}} = (\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha})_{i,:,:} +\end{displaymath} +or in other words, the $i$-th slice of the tensor product $\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha}$ contains $\mat{\beta}\mat{f}_{y_i}\t{\mat{\alpha}}$ for $i = 1, ..., n$. +Equivalently, in matricized form, this can be written as +\begin{displaymath} + (\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha})_{(1)} = \ten{F}_{(1)}(\t{\mat{\alpha}}\kron\t{\mat{\beta}}) +\end{displaymath} + +\section{Equivalencies} +In this section we give a short summary of alternative but equivalent operations. +We use the notation $\widehat{=}$ to indicate that two expressions are identical in the sense that they contain the same elements in the same order but may have different dimensions. Meaning, when vectorizing either side of $\widehat{=}$, they are equal ($\mat{A}\widehat{=}\mat{B}\ :\Leftrightarrow\ \vec{\mat{A}} = \vec{\mat{B}}$). + +Throughout, we use $\mat{A}, \mat{B}, \mat{X}, \mat{F}, \mat{R}, ...$ for matrices. 3-Tensors are written as $\ten{A}, \ten{B}, \ten{T}, \ten{X}, \ten{F}, \ten{R}, ...$. + +\begin{align*} + \ten{T}\ttm[3]\mat{A}\ &{\widehat{=}}\ \mat{T}\t{\mat A} & \ten{T}(n, p, q)\ \widehat{=}\ \mat{T}(n p, q), \mat{A}(p, q) \\ + \ten{T}\ttm[2]\mat{B}\ &{\widehat{=}}\ \mat{B}\ten{T}_{(2)} & \ten{T}(n, p, q), \ten{T}_{(2)}(p, n q), \mat{B}(q, p) +\end{align*} + +\section{Matrix Valued Normal Distribution} +A random variable $\mat{X}$ of dimensions $p\times q$ follows a \emph{matrix-valued normal distribution}, denoted +\begin{displaymath} + \mat{X}\sim\mathcal{MN}_{p\times q}(\mat{\mu}, \mat{\Delta}_2, \mat{\Delta}_1), +\end{displaymath} +if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otimes\mat\Delta_2)$. Note the order of the covariance matrices $\mat\Delta_1, \mat\Delta_2$.
Its density is given by +\begin{displaymath} + f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right). +\end{displaymath} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% Reference Summaries %%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\section{Reference Summaries} +This section contains short summaries of the main references with each subsection concerning one paper. + +\subsection{} + +\subsection{Generalized Tensor Decomposition With Features on Multiple Modes} +The \cite{TensorDecomp-HuLeeWang2022} paper proposes a multi-linear conditional mean model for a constrained rank tensor decomposition. Let the responses $\ten{Y}\in\mathbb{R}^{d_1\times ... \times d_K}$ be an order $K$ tensor. Associated with each mode $k\in[K]$ they assume feature matrices $\mat{X}_k\in\mathbb{R}^{d_k\times p_k}$. Now, they assume that, conditional on the feature matrices $\mat{X}_k$, the entries of the tensor $\ten{Y}$ are independent realizations. The rank constraint is specified through $\mat{r} = (r_1, ..., r_K)$, and the model is given by +\begin{displaymath} + \E(\ten{Y} | \mat{X}_1, ..., \mat{X}_K) = f(\ten{C}\times\{ \mat{X}_1\mat{M}_1, ..., \mat{X}_K\mat{M}_K \}),\qquad \t{\mat{M}_k}\mat{M}_k = \mat{I}_{r_k}\ \forall k\in[K]. +\end{displaymath} +The order $K$ tensor $\ten{C}\in\mathbb{R}^{r_1\times...\times r_K}$ is an unknown full-rank core tensor and the matrices $\mat{M}_k\in\mathbb{R}^{p_k\times r_k}$ are unknown factor matrices. The function $f$ is applied element-wise and serves as the link function based on the assumed distribution family of the tensor entries. Finally, the operation $\times$ denotes the tensor-by-matrix product as a shorthand +\begin{displaymath} + \ten{C}\times\{ \mat{X}_1\mat{M}_1, ..., \mat{X}_K\mat{M}_K \} + = \ten{C}\ttm[1]\mat{X}_1\mat{M}_1\ ...\ttm[K]\mat{X}_K\mat{M}_K +\end{displaymath} +with $\ttm[k]$ denoting the $k$-mode tensor matrix product. + +The algorithm for estimation of $\ten{C}$ and $\mat{M}_1, ..., \mat{M}_K$ assumes the individual conditional entries of $\ten{Y}$ to be independent and to follow a generalized linear model with link function $f$. The proposed algorithm is an iterative algorithm for minimizing the negative log-likelihood +\begin{displaymath} + l(\ten{C}, \mat{M}_1, ..., \mat{M}_K) = \langle \ten{Y}, \Theta \rangle - \sum_{i_1, ..., i_K} b(\Theta_{i_1, ..., i_K}), \qquad \Theta = \ten{C}\times\{ \mat{X}_1\mat{M}_1, ..., \mat{X}_K\mat{M}_K \} +\end{displaymath} +where $b = f'$ is the derivative of the canonical link function $f$ in the generalized linear model that the conditional entries of $\ten{Y}$ follow. The algorithm utilizes the higher-order SVD (HOSVD) to enforce the rank constraint. + +The main benefit is that this approach generalizes well to a multitude of differently structured data sets. + +\todo{ how does this relate to the $\mat{X} = \mat{\mu} + \mat{\beta}\mat{f}_y\t{\mat{\alpha}} + \mat{\epsilon}$ model.} + +\end{document} diff --git a/tensorPredictors/src/Makevars b/tensorPredictors/src/Makevars new file mode 100644 index 0000000..1971f9a --- /dev/null +++ b/tensorPredictors/src/Makevars @@ -0,0 +1,2 @@ + +PKG_LIBS = $(BLAS_LIBS) $(FLIBS)