fix: stability of gmlm_ising and added slicing,

add: higher-order approx.kron,
add: paper.tex - simulation plots and tensor normal MLE estimation
Daniel Kapla 2023-11-21 12:21:43 +01:00
parent 6477d78190
commit 399f878fbb
8 changed files with 830 additions and 401 deletions


@ -487,15 +487,15 @@ Equation \eqref{eq:eta2} is underdetermined since $\t{(\mat{T}_2\pinv{\mat{D}_p}
\todo{Maybe already here introduce the ``constraint'' set of $\Omega$'s allowed as $\{ \Omega\in\mathbb{R}_{++}^{p\times p} : \vec{\Omega} = \t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p})}\vec{\Omega} \}$}

In a classical \emph{generalized linear model} (GLM), the link function connecting the natural parameters to the expectation of the sufficient statistic $\mat{\eta}_y = \mat{g}(\E[\mat{t}(\ten{X}) \mid Y = y])$ is invertible. Such a link may not exist in our setting, but for our purpose what we call the ``inverse'' link suffices. The ``inverse'' link $\widetilde{\mat{g}}$ exists as the natural parameters fully describe the distribution. As in the non-minimal formulation \eqref{eq:quadratic-exp-fam}, we define the ``inverse'' link through its tensor-valued components
\begin{align}
    \ten{g}_1(\mat{\eta}_y) &= \E[\ten{X} \mid Y = y], \label{eq:inv-link1}\\
    \ten{g}_2(\mat{\eta}_y) &= \E[\ten{X}\circ\ten{X} \mid Y = y] \label{eq:inv-link2}
\end{align}
as $\widetilde{\mat{g}}(\mat{\eta}_y) = (\vec\ten{g}_1(\mat{\eta}_y), \vec\ten{g}_2(\mat{\eta}_y))$.

Under the quadratic exponential family model \eqref{eq:quadratic-exp-fam}, a sufficient reduction for the regression of $Y$ on $\ten{X}$ is given in \cref{thm:sdr}.

\begin{theorem}[SDR]\label{thm:sdr}
    A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:quadratic-exp-fam} with natural parameters modeled by \eqref{eq:eta1} and \eqref{eq:eta2} is given by
\begin{align}\label{eq:sdr} \begin{align}\label{eq:sdr}
\ten{R}(\ten{X}) \ten{R}(\ten{X})
% &= (\ten{X} - \E\ten{X})\times\{ \t{\mat{\beta}_1}, \ldots, \t{\mat{\beta}_r} \}. % &= (\ten{X} - \E\ten{X})\times\{ \t{\mat{\beta}_1}, \ldots, \t{\mat{\beta}_r} \}.
@ -551,7 +551,7 @@ The maximum likelihood estimate of $\mat{\theta}_0$ is the solution to the optim
\end{equation}
with $\hat{\mat{\theta}}_n = (\vec\widehat{\overline{\ten{\eta}}}, \vec\widehat{\mat{B}}, \vech\widetilde{\mat{\Omega}})$ where $\widehat{\mat{B}} = \bigkron_{k = r}^{1}\widehat{\mat{\beta}}_k$ and $\widehat{\mat{\Omega}} = \bigkron_{k = r}^{1}\widehat{\mat{\Omega}}_k$.

A straightforward and general method for parameter estimation is \emph{gradient descent}. For applying gradient-based optimization, we compute the gradients of $l_n$ in \cref{thm:grad}.

\begin{theorem}\label{thm:grad}
    For $n$ i.i.d. observations $(\ten{X}_i, y_i), i = 1, \ldots, n$, the log-likelihood has the form \eqref{eq:log-likelihood} with $\mat{\theta}$ being the collection of all GMLM parameters $\overline{\ten{\eta}}$, ${\mat{B}} = \bigkron_{k = r}^{1}{\mat{\beta}}_k$ and ${\mat{\Omega}} = \bigkron_{k = r}^{1}{\mat{\Omega}}_k$ for $k = 1, \ldots, r$. Furthermore, let $\ten{G}_2(\mat{\eta}_y)$ be a tensor of dimensions $p_1, \ldots, p_r$ such that
@ -569,43 +569,84 @@ A straightforward and general method for parameter estimation is \emph{gradient
    If $\mat{T}_2$ is the identity matrix $\mat{I}_{p(p + 1) / 2}$, then $\ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y)$.
\end{theorem}

Although the general case of any GMLM model can be fitted via gradient descent using \cref{thm:grad}, this may be very inefficient. In the case of the tensor normal distribution, an iterative cyclic updating scheme can be derived by setting the partial gradients of \cref{thm:grad} to zero. Described in more detail in \cref{sec:tensor_normal_estimation}, this method has much faster convergence, is stable, and does not require any hyperparameters. The Ising model, on the other hand, does not allow such a scheme. There we need to use a gradient-based method, which is the subject of \cref{sec:ising_estimation}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Tensor Normal}\label{sec:tensor_normal_estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
With $\ten{X}\mid Y = y$ following a tensor normal distribution with mean $\ten{\mu}_y$ and covariance $\mat{\Sigma} = \bigkron_{k = r}^{1}\mat{\Sigma}_k$, its density is given by
\begin{displaymath}
    f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p / 2}\prod_{k = 1}^{r}\det(\mat{\Sigma}_k)^{-p / (2 p_k)}\exp\left( -\frac{1}{2}\left\langle\ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1} \right\rangle \right).
\end{displaymath}
We assume the distribution is non-degenerate, meaning that the covariances $\mat{\Sigma}_k$ are symmetric positive definite matrices. For the sake of simplicity, we assume w.l.o.g. that $\ten{X}$ has marginal expectation zero, $\E\ten{X} = 0$. Rewriting this density in the quadratic exponential family form \eqref{eq:quadratic-exp-fam} determines the scaling constant $c = -1/2$ and gives the relation to the GMLM parameters $\overline{\ten{\eta}}, \mat{\beta}_k$ and $\mat{\Omega}_k$, for $k = 1, \ldots, r$, as
\begin{displaymath}
\ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k, \qquad
\mat{\Omega}_k = \mat{\Sigma}_k^{-1}
\end{displaymath}
where we used that $\overline{\ten{\eta}} = 0$ due to $0 = \E\ten{X} = \E\E[\ten{X}\mid Y] = \E\ten{\mu}_Y$ in combination with $\E\ten{F}_Y = 0$. Additionally, all the $\mat{\Omega}_k$'s are symmetric positive definite, because the $\mat{\Sigma}_k$'s are. This leads to another simplification: the matrix $\mat{T}_2$ from \eqref{eq:t-stat} is equal to the identity. Consequently, the gradients of the log-likelihood $l_n$ in \cref{thm:grad} simplify as well. We get
\begin{displaymath}
\ten{g}_1(\mat{\eta}_y) = \E[\ten{X}\mid Y = y] = \ten{\mu}_y, \qquad
\ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y) = \E[\ten{X}\circ\ten{X}\mid Y = y] \equiv \bigkron_{k = r}^1\mat{\Sigma}_k + (\vec{\ten{\mu}}_y)\t{(\vec{\ten{\mu}}_y)}.
\end{displaymath}
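%%% A minimal numerical check of these link components in R (a sketch, assuming
%%% the `mlm` helper of the accompanying tensorPredictors package; all sizes and
%%% values below are made up for illustration):
% R> library(tensorPredictors)
% R> Omegas <- list(toeplitz(0.5^(0:2)), diag(4))   # example precision factors
% R> betas  <- list(matrix(rnorm(3 * 2), 3, 2), matrix(rnorm(4), 4, 1))
% R> F_y    <- array(rnorm(2), dim = c(2, 1))       # F(y) for one observation
% R> Sigmas <- Map(solve, Omegas)                   # Sigma_k = Omega_k^-1
% R> # mean: mu_y = F_y  x_k  Omega_k^-1 beta_k, as above
% R> mu_y <- mlm(F_y, Map(`%*%`, Sigmas, betas))
% R> # second moment: G_2 = kron_k Sigma_k + vec(mu_y) vec(mu_y)'
% R> G2 <- Reduce(kronecker, rev(Sigmas)) + tcrossprod(as.vector(mu_y))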
In practice, we start the estimation process by demeaning a given data set $(\ten{X}_i, \ten{F}_{y_i})$ with $i = 1, \ldots, n$ observations. After demeaning, only the reduction matrices $\mat{\beta}_k$ and the scatter matrices $\mat{\Omega}_k$ need to be estimated. Then, to solve the optimization problem \eqref{eq:mle}, with $\overline{\ten{\eta}} = 0$ after demeaning, we need some initial values. For the initial estimates $\hat{\mat{\beta}}_k^{(0)}$ we use a simple heuristic approach. First, we compute moment-based mode-wise marginal covariance estimates $\widehat{\mat{\Sigma}}_k(\ten{X})$ and $\widehat{\mat{\Sigma}}_k(\ten{F}_Y)$ given by
\begin{displaymath}
\widehat{\mat{\Sigma}}_k(\ten{X}) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{X}_i)_{(k)}\t{(\ten{X}_i)_{(k)}}, \qquad
\widehat{\mat{\Sigma}}_k(\ten{F}_Y) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{F}_{y_i})_{(k)}\t{(\ten{F}_{y_i})_{(k)}}.
\end{displaymath}
Then, for every mode $k = 1, \ldots, r$, we compute the first $j = 1, \ldots, q_k$ eigenvectors $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))$ and eigenvalues $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))$ of the marginal covariance estimates. Let
\begin{align*}
    \mat{U}_k &= (\mat{v}_1(\widehat{\mat{\Sigma}}_k(\ten{X})), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{X}))), \\
    \mat{D}_k &= \diag(\lambda_1(\widehat{\mat{\Sigma}}_k(\ten{X}))\lambda_1(\widehat{\mat{\Sigma}}_k(\ten{F}_{Y})), \ldots, \lambda_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{X}))\lambda_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{F}_{Y}))), \\
    \mat{V}_k &= (\mat{v}_1(\widehat{\mat{\Sigma}}_k(\ten{F}_Y)), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))).
\end{align*}
The initial estimates are then set to
\begin{displaymath}
\hat{\mat{\beta}}_k^{(0)} = \mat{U}_k\sqrt{\mat{D}_k}\t{\mat{V}_k}.
\end{displaymath}
The initial values of $\mat{\Omega}_k$ are set to the identity $\mat{\Omega}_k^{(0)} = \mat{I}_{p_k}$.
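%%% The initialization heuristic as an R sketch (hypothetical helper
%%% `init_betas`; assumes the `mat` matricization from tensorPredictors and
%%% that the last axis of `X` and `Fy` enumerates the `n` observations):
% R> init_betas <- function(X, Fy, q) {
% R>     n <- tail(dim(X), 1)
% R>     Map(function(k) {
% R>         SigX <- tcrossprod(mat(X,  k)) / n      # Sigma_k(X)
% R>         SigF <- tcrossprod(mat(Fy, k)) / n      # Sigma_k(F_Y)
% R>         eX <- eigen(SigX, symmetric = TRUE)     # eigen pairs (decreasing)
% R>         eF <- eigen(SigF, symmetric = TRUE)
% R>         U <- eX$vectors[, seq_len(q[k]), drop = FALSE]
% R>         V <- eF$vectors[, seq_len(q[k]), drop = FALSE]
% R>         d <- eX$values[seq_len(q[k])] * eF$values[seq_len(q[k])]
% R>         U %*% (sqrt(d) * t(V))                  # U_k sqrt(D_k) V_k'
% R>     }, seq_len(length(dim(X)) - 1L))
% R> }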
Given any estimates $\hat{\mat{\beta}}_1, \ldots, \hat{\mat{\beta}}_r, \hat{\mat{\Omega}}_1, \ldots, \hat{\mat{\Omega}}_r$, we take the gradient $\nabla_{\mat{\beta}_j}$ of the log-likelihood $l_n$ in \eqref{eq:log-likelihood} for the tensor normal distribution from \cref{thm:grad} and keep all other parameters except $\mat{\beta}_j$ fixed. Then, $\nabla_{\mat{\beta}_j}l_n = 0$ has the closed form solution
\begin{equation}\label{eq:tensor_normal_beta_solution}
\t{\mat{\beta}_j} = \biggl(
\sum_{i = 1}^{n}
\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\Omega}}_k^{-1}\hat{\mat{\beta}}_k \Bigr)_{(j)}
\t{\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\beta}}_k \Bigr)_{(j)}}
\biggr)^{-1}
\biggl(
\sum_{i = 1}^{n}
\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\beta}}_k \Bigr)_{(j)}
\t{(\ten{X}_{i})_{(j)}}
\biggr)
\hat{\mat{\Omega}}_j.
\end{equation}
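%%% The closed form update \eqref{eq:tensor_normal_beta_solution} as an R
%%% sketch (hypothetical helper `update_beta`; assumes `mlm` and `mat` from
%%% tensorPredictors and the sample axis last):
% R> update_beta <- function(j, X, Fy, betas, Omegas) {
% R>     modes <- seq_along(betas)
% R>     OB  <- Map(solve, Omegas, betas)            # Omega_k^-1 beta_k
% R>     lhs <- mlm(Fy, OB[-j],    modes[-j])        # F_y x_{k != j} Omega_k^-1 beta_k
% R>     rhs <- mlm(Fy, betas[-j], modes[-j])        # F_y x_{k != j} beta_k
% R>     t(solve(tcrossprod(mat(lhs, j), mat(rhs, j)),
% R>             tcrossprod(mat(rhs, j), mat(X, j))) %*% Omegas[[j]])
% R> }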
For the scatter matrices $\mat{\Omega}_j$, we need to fudge a bit. Equating the partial gradient of the $j$'th scatter matrix $\mat{\Omega}_j$ in \cref{thm:grad} to zero, $\nabla_{\mat{\Omega}_j}l_n = 0$, gives a quadratic matrix equation due to the dependence of $\ten{\mu}_y$ on $\mat{\Omega}_j$. In practice though, it is more stable, faster, and equally accurate to use mode-wise covariance estimates via the residuals
\begin{displaymath}
\hat{\ten{R}}_i = \ten{X}_i - \hat{\ten{\mu}}_{y_i} = \ten{X}_i - \ten{F}_{y_i}\mlm_{k = 1}^{r}\hat{\mat{\Omega}}_k^{-1}\hat{\mat{\beta}}_k.
\end{displaymath}
The estimates are computed via the intermediate values
\begin{displaymath}
\tilde{\mat{\Sigma}}_j = \sum_{i = 1}^n (\hat{\ten{R}}_i)_{(j)} \t{(\hat{\ten{R}}_i)_{(j)}}
\end{displaymath}
which only lack the proper scaling $\tilde{s}$ such that $\tilde{s}\tilde{\mat{\Sigma}}_j = \hat{\mat{\Omega}}_j^{-1}$. The scaling condition is that the mean squared error has to be equal to the trace of the covariance estimate,
\begin{displaymath}
\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle = \tr\bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k^{-1} = \prod_{k = 1}^{r}\tr{\hat{\mat{\Omega}}_k^{-1}} = \tilde{s}^r\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k}.
\end{displaymath}
Solving for the scaling value gives
\begin{displaymath}
\tilde{s} = \biggl(\Bigl(\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k}\Bigr)^{-1}\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle\biggr)^{1 / r}
\end{displaymath}
resulting in the estimates $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{-1}$.
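%%% The residual based scatter estimates as an R sketch (hypothetical helper
%%% `update_Omegas`; assumptions as before; the conditioning safeguard of the
%%% following paragraphs is omitted here):
% R> update_Omegas <- function(X, Fy, betas, Omegas) {
% R>     r <- length(betas)
% R>     n <- tail(dim(X), 1)
% R>     R <- X - mlm(Fy, Map(solve, Omegas, betas))         # residuals
% R>     Sigmas <- Map(function(j) tcrossprod(mat(R, j)), seq_len(r))
% R>     mse <- sum(R^2) / n                                 # mean squared error
% R>     s <- (mse / prod(mapply(function(S) sum(diag(S)), Sigmas)))^(1 / r)
% R>     Map(function(S) solve(s * S), Sigmas)               # (s Sigma_j)^-1
% R> }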
Estimation is then performed by updating, for $j = 1, \ldots, r$, the estimates $\hat{\mat{\beta}}_j$ via \eqref{eq:tensor_normal_beta_solution}; then we recompute the $\hat{\mat{\Omega}}_j$ estimates simultaneously, keeping the $\hat{\mat{\beta}}_j$'s fixed. This procedure is repeated until convergence. Convergence is very fast; experiments showed that it usually occurs in less than $10$ iterations.
A technical detail for numerical stability is to ensure that the scaled intermediate values $\tilde{s}\tilde{\mat{\Sigma}}_j$, assumed to be symmetric and positive definite, are well conditioned. Otherwise, computing $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{-1}$ can be numerically unstable. Therefore, we estimate the condition number of $\tilde{s}\tilde{\mat{\Sigma}}_j$ before computing the inverse. If it is ill conditioned, we regularize and use $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j + 0.2 \lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)\mat{I}_{p_j})^{-1}$ instead, where $\lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)$ is the first (maximum) eigenvalue. Experiments showed that this regularization is usually required in the first few iterations only.
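%%% The safeguard in R (a sketch; the tolerance is an assumption, `S` stands
%%% for the scaled intermediate value s * Sigma_j):
% R> if (rcond(S) < 1e-10) {     # reciprocal condition estimate too small?
% R>     S <- S + 0.2 * max(eigen(S, TRUE, only.values = TRUE)$values) * diag(nrow(S))
% R> }
% R> Omega_j <- solve(S)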
Furthermore, we may assume that the parameter space follows a more general setting as in \cref{thm:param-manifold}. In this case, updating may produce estimates outside of the parameter space. A simple and efficient remedy is to project every updated estimate onto the corresponding manifold.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Ising Model}\label{sec:ising_estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
@ -674,6 +715,8 @@ As a basis to ensure that the constraint parameter space $\Theta$ is a manifold,
    then the constrained parameter space $\Theta = \mathbb{R}^p \times \manifold{K}_{\mat{B}}\times\manifold{CK}_{\mat{\Omega}}\subset\mathbb{R}^{p(p + 2 q + 3) / 2}$ is a smooth embedded manifold.
\end{theorem}
\subsection{Matrix Manifolds}\label{sec:matrix-manifolds}
{\color{red}
Flexibility of Manifold Structure
@ -716,55 +759,6 @@ It is not necessary to have a perfect maximizer, as long as the objective has fi
    with asymptotic variance-covariance structure $\mat{\Sigma}_{\mat{\theta}_0}$ given in \eqref{eq:asymptotic-covariance-gmlm}.
\end{theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Asymptotic Normality}
@ -814,38 +808,96 @@ The following is a reformulation of \textcite[Lemma~2.3]{asymptoticMLE-BuraEtAl2
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We compare our method with a few other methods for the tensor normal and the Ising model (inverse Ising problem).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Tensor Normal}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{figure}
    \centering
    \includegraphics[width = \textwidth]{plots/sim-normal.pdf}
    \caption{\label{fig:sim-normal}\todo{Write the tensor normal simulation caption.}}
\end{figure}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Ising Model}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{figure}
    \centering
    \includegraphics[]{plots/sim-ising.pdf}
    \caption{\label{fig:sim-ising}\todo{Write the Ising simulation caption.}}
\end{figure}

\begin{table}
    \begin{tabular}{c | ccc ccc c}
        $n$ & GMLM & PCA & HOPCA & LPCA & CLPCA & TSIR & MGCCA \\
        \hline
        100 & {\bf 0.34} (0.14) & 0.90 (0.04) & 0.90 (0.05) & 0.94 (0.09) & 0.91 (0.03) & 0.48 (0.19) & 0.55 (0.13) \\
        200 & {\bf 0.25} (0.11) & 0.90 (0.03) & 0.90 (0.03) & 0.96 (0.07) & 0.91 (0.02) & 0.38 (0.16) & 0.53 (0.10) \\
        300 & {\bf 0.20} (0.09) & 0.89 (0.02) & 0.89 (0.02) & 0.97 (0.06) & 0.91 (0.02) & 0.29 (0.13) & 0.51 (0.11) \\
        500 & {\bf 0.16} (0.07) & 0.90 (0.02) & 0.90 (0.02) & 0.98 (0.01) & 0.91 (0.01) & 0.23 (0.10) & 0.50 (0.08) \\
        750 & {\bf 0.13} (0.05) & 0.90 (0.01) & 0.90 (0.01) & 0.98 (0.02) & 0.91 (0.01) & 0.23 (0.08) & 0.53 (0.06)
    \end{tabular}
    \caption{\label{tab:sim-ising}\todo{Write the Ising simulation table caption.}}
\end{table}
% The \emph{tensor normal distribution} $\mathcal{TN}(\ten{\mu}, \mat{\Sigma}_1, ..., \mat{\Sigma}_r)$ is a generalization of the \emph{matrix normal distribution} $\mathcal{MN}(\mat{\mu}, \mat{\Sigma}_1, \mat{\Sigma}_2)$. \todo{ref} The density of the conditional tensor normal distribution $\ten{X}\mid Y = y$ according to the quadratic exponential family \eqref{eq:quadratic-exp-fam} where only the first moment depends on $y$ is given by
% \begin{displaymath}
% f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p/2}\prod_{k = 1}^r |\mat{\Sigma}_{k}|^{-p / 2 p_{k}}\exp\Big(
% -\frac{1}{2}\Big\langle \ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k\in[r]}\mat{\Sigma}_{k} \Big\rangle
% \Big)
% \end{displaymath}
% Rewriting this in the form of an exponential family as in \eqref{eq:quadratic-exp-fam} allows to determin the natural parameter components $\mat{\eta}_{y1}$ and $\mat{\eta}_2$. Since a non-degenerate normal distribution requires the covariance matrices $\mat{\Sigma}_k$ to be symmetric positive definite the relation to the second moment natural parameter $\mat{\eta}_2$ simplifies as we can set $\mat{T}_2$ in \eqref{eq:eta2-manifold} to the identity. This then gives the relation to the natural parameters as in \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold} as
% \begin{displaymath}
% \mat{\eta}_{1y} = \vec\Bigl(\ten{\mu}_y\mlm_{k = 1}^{r}\mat{\Sigma}_k\Bigr), \qquad
% \mat{\eta}_2 = c\t{\mat{D}_p}\vec\bigkron_{k = r}^{1}\mat{\Sigma}_k^{-1}
% \end{displaymath}
% with scaling constant $c = -1 / 2$. Modeling the natural parameters as in \eqref{eq:eta1} and \eqref{eq:eta2} relates the mean $\ten{\mu}_y$ and the covariance matrices $\mat{\Sigma}_k$ of the tensor normal to the generalized multi-linear model parameter $\overline{\ten{\eta}}$ and $\mat{\beta}_k$, $\mat{\Omega}_k$, for $k = 1, \ldots, r$ through
% \begin{displaymath}
% \ten{\mu}_y = \Bigl(\overline{\ten{\eta}} + \ten{F}_y\mlm_{j = 1}^{r}\mat{\beta}_j\Bigr)\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}, \qquad \mat{\Omega}_k = \mat{\Sigma}_k^{-1}.
% \end{displaymath}
% This completely determines the tensor normal distribution given the GMLM parameter.
% To estimate the GMLM parameters $\mat{\theta} = (\overline{\ten{\eta}}, \mat{\beta}_1, \ldots, \mat{\beta}_r, \mat{\Omega}_1, \ldots\mat{\Omega}_r)$ given a data set $(\ten{X}_i, y_i)$ of $i = 1, \ldots, n$ observation we use the gradients provided by \cref{thm:grad}. It turns out that the equations $\nabla_{\overline{\ten{\eta}}}l_n = 0, \nabla_{\mat{\beta}_j}l_n = 0$ and $\nabla_{\mat{\Omega}_j}l_n = 0$, for $j = 1, \ldots, r$, can be solved for the differentiation variable assuming all the other parameter blocks to be constant. Centering the observed $\ten{X}$ leads to a cleaner formulation given by \todo{fix the following!}
% \begin{align*}
% \hat{\overline{\ten{\eta}}} &= \frac{1}{n}\sum_{i = 1}^{n} \ten{X}_i, \\
% \t{\hat{\mat{\beta}}_j} &= \mat{\Omega}_j \Bigl(\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\Omega}_k^{-1}\mat{\beta}_k\Bigr)_{(j)}\t{\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\beta}_k\Bigr)_{(j)}}\Bigr)^{-1}\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\beta}_k\Bigr)_{(j)}\ten{X}_{(j)}, \\
% \t{\Omega}_j &= scaling \frac{1}{n}
% \end{align*}
% This allows to use a \emph{block coordinate descent} method instead of gradient descent. This method keeps all but one parameter block fixed and optimized the objective for a single block. Given a closed form solution for the partial gradients, only a single update is required to solve the partial optimization problem. This means that the block coordinate descent method reduces to a cyclic updating. This not only converges very fast, it also does not require any hyper parameters.
% For this iterative scheme we do need some initial estimates. For those we
% \subsection{Matrix Normal and Ising Model}
% If $\mat{X} \in \mathbb{R}^{p_1 \times p_2}$, then $\mat{\theta} = (\overline{\eta}, \mat{\beta}_1, \mat{\beta}_2, \mat{\Omega}_1, \mat{\Omega}_2)$ and $\mat{F}_y$ is also matrix valued. The conditional pdf of $\mat{X}\mid Y$ is
% \begin{align*}
% f_{\mat{\theta}}(\mat{X}\mid Y = y)
% &= h(\mat{X})\exp(\langle\mat{X}, \mat{\eta}_1(\mat{\theta})\rangle + \langle\mat{X}\circ\mat{X}, \mat{\eta}_2(\mat{\theta})\rangle - b(\mat{\eta}_y(\mat{\theta}))) \\
% &= h(\mat{X})\exp(\tr((\overline{\mat{\eta}} + \mat{\beta}_1\mat{F}_y\t{\mat{\beta}_2})\t{\mat{X}}) + \tr(\mat{\Omega}_1\mat{X}\mat{\Omega}_2\t{\mat{X}}) - b(\mat{\eta}_y(\mat{\theta}))).
% \end{align*}
% The MLE estimate $\hat{\mat{\theta}}_n = (\widehat{\overline{\mat{\eta}}}, \widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1, \widehat{\mat{\Omega}}_2\otimes \widehat{\mat{\Omega}}_2)$ is asymptotically normal and
% \begin{displaymath}
% \widehat{\ten{R}}(\mat{X}) = \t{(\widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1)}\vec(\mat{X} - \E\mat{X}) \equiv \t{\widehat{\mat{\beta}}_1}(\mat{X} - \E\mat{X})\widehat{\mat{\beta}}_2
% \end{displaymath}
% is the MLE of the sufficient reduction $\ten{R}(\mat{X})$ of dimension $q_1 \times q_2 \leq p_1 \times p_2 $.\\[1.6em]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\appendix

LaTeX/plots/sim-ising.tex Normal file

@ -0,0 +1,151 @@
%%% R code to generate the input data files from corresponding simulation logs
% R> setwd("~/Work/tensorPredictors")
% R>
% R> for (sim.name in c("2a")) {
% R> pattern <- paste0("sim\\_", sim.name, "\\_ising\\-[0-9T]*\\.csv")
% R> log.file <- sort(
% R> list.files(path = "sim/", pattern = pattern, full.names = TRUE),
% R> decreasing = TRUE
% R> )[[1]]
% R>
% R> sim <- read.csv(log.file)
% R>
% R> aggr <- aggregate(sim[, names(sim) != "sample.size"], list(sample.size = sim$sample.size), mean)
% R>
% R> write.table(aggr, file = paste0("LaTeX/plots/aggr-", sim.name, "-ising.csv"), row.names = FALSE, quote = FALSE)
% R> }
\documentclass[border=0cm]{standalone}
\usepackage{tikz}
\usepackage{pgfplots}
\usepackage{bm}
\definecolor{gmlm}{RGB}{0,0,0}
\definecolor{mgcca}{RGB}{86,180,233}
\definecolor{tsir}{RGB}{0,158,115}
\definecolor{pca}{RGB}{240,228,66}
\definecolor{hopca}{RGB}{230,159,0}
\definecolor{lpca}{RGB}{127,127,127}
\definecolor{clpca}{RGB}{191,191,191}
\pgfplotsset{
every axis/.style={
xtick={100,200,300,500,750},
ymin=-0.05, ymax=1.05,
grid=both,
grid style={gray, dotted}
},
every axis plot/.append style={
mark = *,
mark size = 1pt,
line width=0.8pt
}
}
\tikzset{
legend entry/.style={
mark = *,
mark size = 1pt,
mark indices = {2},
line width=0.8pt
}
}
\begin{document}
\begin{tikzpicture}[>=latex]
\begin{axis}[
name=sim-2a
]
\addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-2a-ising.csv};
\addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-2a-ising.csv};
\addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-2a-ising.csv};
\addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-2a-ising.csv};
\addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-2a-ising.csv};
\addplot[color = lpca] table[x = sample.size, y = dist.subspace.lpca] {aggr-2a-ising.csv};
\addplot[color = clpca] table[x = sample.size, y = dist.subspace.clpca] {aggr-2a-ising.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-2a.north west) {
a: small
};
% \begin{axis}[
% name=sim-1b,
% anchor=north west, at={(sim-2a.right of north east)}, xshift=0.1cm,
% xticklabel=\empty,
% ylabel near ticks, yticklabel pos=right
% ]
% \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1b-normal.csv};
% \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1b-normal.csv};
% \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1b-normal.csv};
% \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1b-normal.csv};
% \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1b-normal.csv};
% \end{axis}
% \node[anchor = base west, yshift = 0.3em] at (sim-1b.north west) {
% b: cubic dependence on $y$
% };
% \begin{axis}[
% name=sim-1c,
% anchor=north west, at={(sim-2a.below south west)}, yshift=-.8em,
% xticklabel=\empty
% ]
% \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1c-normal.csv};
% \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1c-normal.csv};
% \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1c-normal.csv};
% \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1c-normal.csv};
% \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1c-normal.csv};
% \end{axis}
% \node[anchor = base west, yshift = 0.3em] at (sim-1c.north west) {
% c: rank $1$ $\boldsymbol{\beta}$'s
% };
% \begin{axis}[
% name=sim-1d,
% anchor=north west, at={(sim-1c.right of north east)}, xshift=0.1cm,
% ylabel near ticks, yticklabel pos=right
% ]
% \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1d-normal.csv};
% \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1d-normal.csv};
% \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1d-normal.csv};
% \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1d-normal.csv};
% \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1d-normal.csv};
% \end{axis}
% \node[anchor = base west, yshift = 0.3em] at (sim-1d.north west) {
% d: tri-diagonal $\boldsymbol{\Omega}$'s
% };
% \begin{axis}[
% name=sim-1e,
% anchor=north west, at={(sim-1c.below south west)}, yshift=-.8em
% ]
% \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1e-normal.csv};
% \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1e-normal.csv};
% \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1e-normal.csv};
% \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1e-normal.csv};
% \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1e-normal.csv};
% \end{axis}
% \node[anchor = base west, yshift = 0.3em] at (sim-1e.north west) {
% e: missspecified
% };
\matrix[anchor = west] at (sim-2a.right of east) {
\draw[color=gmlm, legend entry, line width=1pt] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {GMLM}; \\
\draw[color=tsir, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {TSIR}; \\
\draw[color=mgcca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {MGCCA}; \\
\draw[color=hopca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {HOPCA}; \\
\draw[color=pca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {PCA}; \\
\draw[color=lpca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {LPCA}; \\
\draw[color=clpca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {CLPCA}; \\
};
\node[anchor = north] at (current bounding box.south) {Sample Size $n$};
\node[anchor = south, rotate = 90] at (current bounding box.west) {Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$};
\node[anchor = south, rotate = 270] at (current bounding box.east) {\phantom{Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$}};
\node[anchor = south, font=\large] at (current bounding box.north) {Ising Simulation};
\end{tikzpicture}
\end{document}

LaTeX/plots/sim-normal.tex Normal file

@ -0,0 +1,148 @@
%%% R code to generate the input data files from corresponding simulation logs
% R> setwd("~/Work/tensorPredictors")
% R>
% R> for (sim.name in c("1a", "1b", "1c", "1d", "1e")) {
% R> pattern <- paste0("sim\\_", sim.name, "\\_normal\\-[0-9T]*\\.csv")
% R> log.file <- sort(
% R> list.files(path = "sim/", pattern = pattern, full.names = TRUE),
% R> decreasing = TRUE
% R> )[[1]]
% R>
% R> sim <- read.csv(log.file)
% R>
% R> aggr <- aggregate(sim[, names(sim) != "sample.size"], list(sample.size = sim$sample.size), mean)
% R>
% R> write.table(aggr, file = paste0("LaTeX/plots/aggr-", sim.name, "-normal.csv"), row.names = FALSE, quote = FALSE)
% R> }
\documentclass[border=0cm]{standalone}
\usepackage{tikz}
\usepackage{pgfplots}
\usepackage{bm}
\definecolor{gmlm}{RGB}{0,0,0}
\definecolor{mgcca}{RGB}{86,180,233}
\definecolor{tsir}{RGB}{0,158,115}
\definecolor{hopca}{RGB}{230,159,0}
\definecolor{pca}{RGB}{240,228,66}
\definecolor{lpca}{RGB}{0,114,178}
\definecolor{clpca}{RGB}{213,94,0}
\pgfplotsset{
every axis/.style={
xtick={100,200,300,500,750},
ymin=-0.05, ymax=1.05,
grid=both,
grid style={gray, dotted}
},
every axis plot/.append style={
mark = *,
mark size = 1pt,
line width=0.8pt
}
}
\tikzset{
legend entry/.style={
mark = *,
mark size = 1pt,
mark indices = {2},
line width=0.8pt
}
}
\begin{document}
\begin{tikzpicture}[>=latex]
\begin{axis}[
name=sim-1a,
xticklabel=\empty
]
\addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1a-normal.csv};
\addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1a-normal.csv};
\addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1a-normal.csv};
\addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1a-normal.csv};
\addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1a-normal.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-1a.north west) {
a: linear dependence on $\mathcal{F}_y \equiv y$
};
\begin{axis}[
name=sim-1b,
anchor=north west, at={(sim-1a.right of north east)}, xshift=0.1cm,
xticklabel=\empty,
ylabel near ticks, yticklabel pos=right
]
\addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1b-normal.csv};
\addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1b-normal.csv};
\addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1b-normal.csv};
\addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1b-normal.csv};
\addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1b-normal.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-1b.north west) {
b: cubic dependence on $y$
};
\begin{axis}[
name=sim-1c,
anchor=north west, at={(sim-1a.below south west)}, yshift=-.8em,
xticklabel=\empty
]
\addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1c-normal.csv};
\addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1c-normal.csv};
\addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1c-normal.csv};
\addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1c-normal.csv};
\addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1c-normal.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-1c.north west) {
c: rank $1$ $\boldsymbol{\beta}$'s
};
\begin{axis}[
name=sim-1d,
anchor=north west, at={(sim-1c.right of north east)}, xshift=0.1cm,
ylabel near ticks, yticklabel pos=right
]
\addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1d-normal.csv};
\addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1d-normal.csv};
\addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1d-normal.csv};
\addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1d-normal.csv};
\addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1d-normal.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-1d.north west) {
d: tri-diagonal $\boldsymbol{\Omega}$'s
};
\begin{axis}[
name=sim-1e,
anchor=north west, at={(sim-1c.below south west)}, yshift=-.8em
]
\addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1e-normal.csv};
\addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1e-normal.csv};
\addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1e-normal.csv};
\addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1e-normal.csv};
\addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1e-normal.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-1e.north west) {
e: misspecified
};
\matrix[anchor = center] at (sim-1e.right of east -| sim-1d.south) {
\draw[color=gmlm, legend entry, line width=1pt] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {GMLM}; \\
\draw[color=tsir, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {TSIR}; \\
\draw[color=mgcca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {MGCCA}; \\
\draw[color=hopca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {HOPCA}; \\
\draw[color=pca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {PCA}; \\
};
\node[anchor = north] at (current bounding box.south) {Sample Size $n$};
\node[anchor = south, rotate = 90] at (current bounding box.west) {Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$};
\node[anchor = south, rotate = 270] at (current bounding box.east) {\phantom{Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$}};
\node[anchor = south, font=\large] at (current bounding box.north) {Tensor Normal Simulation};
\end{tikzpicture}
\end{document}


@ -11,8 +11,8 @@ export(.projRank)
export(.projSymBand)
export(.projSymRank)
export(CISE)
export(Dup)
export(Dup.pinv)
export(GMLM)
export(GMLM.default)
export(HOPCA)
@ -32,6 +32,7 @@ export(RMReg)
export(RMap)
export(S)
export(TSIR)
export(approx.kron)
export(approx.kronecker)
export(colKronecker)
export(decompose.kronecker)
@ -41,6 +42,7 @@ export(dist.projection)
export(dist.subspace)
export(exprs.all.equal)
export(gmlm_ising)
export(gmlm_mlm)
export(gmlm_tensor_normal)
export(icu_tensor_normal)
export(ising_m2)
@ -72,6 +74,7 @@ export(reduce)
export(riccati)
export(rowKronecker)
export(rtensornorm)
export(slice.assign.expr)
export(slice.expr)
export(slice.select)
export(sylvester)


@ -1,3 +1,31 @@
#' Higher-order Kronecker product decomposition approximation.
#'
#' @examples
#' nrows <- c(3, 2, 5)
#' ncols <- c(2, 4, 8)
#'
#' A <- Reduce(kronecker, Map(matrix, Map(rnorm, nrows * ncols), nrows))
#'
#' all.equal(A, Reduce(kronecker, approx.kron(A, nrows, ncols)))
#'
#' @export
approx.kron <- function(A, nrows, ncols) {
# rearrange kronecker product `A` into outer product `R`
dim(A) <- c(rev(nrows), rev(ncols))
axis.perm <- as.vector(t(matrix(seq_along(dim(A)), ncol = 2L))[, rev(seq_along(nrows))])
R <- aperm(A, axis.perm, resize = FALSE)
dim(R) <- nrows * ncols
# scaling factor for every product component
s <- sum(A^2)^(1 / length(dim(A)))
# higher order SVD on `R` with one singular vector
Map(function(mode, nr, nc) {
s * `dim<-`(La.svd(mcrossprod(R, mode = mode), 1L, 0L)$u, c(nr, nc))
}, seq_along(nrows), nrows, ncols)
}
#' Approximates Kronecker Product decomposition.
#'
#' Approximates the matrices `A` and `B` such that


@ -1,42 +1,110 @@
#' Specialized version of the GMLM for the Ising model (inverse Ising problem)
#'
#' @export
gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
    # proj.betas = ..., proj.Omegas = ..., # TODO: this
    max.iter = 1000L,
    eps = sqrt(.Machine$double.eps),
    step.size = 1e-3,
    zig.zag.threashold = 20L,
    patience = 3L,
    nr.slices = 20L,                            # only for univariate `F(y) = y`
    slice.method = c("cut", "ecdf", "none"),    # only for univariate `F(y) = y` and `y` is a factor or integer
    logger = function(...) { }
) {
    # Get problem dimensions
    dimX <- dim(X)[-sample.axis]
    sample.size <- dim(X)[sample.axis]
    # treat scalar `F` as a tensor
    if (is.null(dim(F))) {
        dimF <- rep(1L, length(dimX))
        dim(F) <- ifelse(seq_along(dim(X)) == sample.axis, sample.size, 1L)
    } else {
        dimF <- dim(F)[-sample.axis]
    }

    # rearrange `X`, `F` such that the last axis enumerates observations
    if (sample.axis != length(dim(X))) {
        axis.perm <- c(seq_along(dim(X))[-sample.axis], sample.axis)
        X <- aperm(X, axis.perm)
        F <- aperm(F, axis.perm)
        sample.axis <- length(dim(X))
    }
    modes <- seq_along(dimX)

    # Special case for univariate response `vec F(y) = y`
    # Due to high computational costs we use slicing
    slice.method <- match.arg(slice.method)
    slices.ind <- if ((slice.method != "none") && (length(F) == sample.size)) {
        y <- as.vector(F)
        if (!(is.factor(y) || is.integer(y))) {
            if (slice.method == "ecdf") {
                y <- cut(ecdf(y)(y), nr.slices)
            } else {
                y <- cut(y, nr.slices)
            }
        }
        split(seq_len(sample.size), y, drop = TRUE)
    } else {
        seq_len(sample.size)
    }

    # initialize betas with tensor normal estimate (ignoring data being binary)
    fit_normal <- gmlm_tensor_normal(X, F, sample.axis = length(dim(X)))
    betas <- fit_normal$betas
    # initialize Omegas from mode-wise two-way marginal frequencies (log odds)
    Omegas <- Omegas.init <- Map(function(mode) {
        n <- prod(dim(X)[-mode])
        prob2 <- mcrossprod(X, mode = mode) / n
        prob2[prob2 == 0] <- 1 / n
        prob1 <- diag(prob2)
        `prob1^2` <- outer(prob1, prob1)
        `diag<-`(log(((1 - `prob1^2`) / `prob1^2`) * prob2 / (1 - prob2)), 0)
    }, modes)

    # Determine degenerate combinations, that is, pairs of variables which are
    # never observed together (mutually exclusive) in the data set
    matX <- mat(X, sample.axis)
    degen <- crossprod(matX) == 0
    degen.mask <- which(degen)
    # If there are degenerate combinations, compute an (arbitrary) bound for the
    # log odds parameters of those combinations
    if (any(degen.mask)) {
        degen.ind <- arrayInd(degen.mask, dim(degen))
        meanX <- colMeans(matX)
        prodX <- meanX[degen.ind[, 1]] * meanX[degen.ind[, 2]]
        degen.bounds <- log((1 - prodX) / (prodX * sample.size))
        # Component indices in Omegas of degenerate two-way interactions
        degen.ind <- arrayInd(degen.mask, rep(dimX, 2))
        degen.ind <- Map(function(d, m) {
            degen.ind[, m] + dimX[m] * (degen.ind[, m + length(dimX)] - 1L)
        }, dimX, seq_along(dimX))

        ## Enforce initial value degeneracy interaction param. constraints
        # Extract parameters corresponding to degenerate interactions
        degen.params <- do.call(rbind, Map(`[`, Omegas, degen.ind))
        # Degeneracy Constrained Parameters (sign is dropped)
        DCP <- mapply(function(vals, bound) {
            logVals <- log(abs(vals))
            err <- max(0, sum(logVals) - log(abs(bound)))
            exp(logVals - (err / length(vals)))
        }, split(degen.params, col(degen.params)), degen.bounds)
        # Update values in Omegas such that all degeneracy constraints hold
        Omegas <- Map(function(Omega, cp, ind) {
            # Combine multiple constraints for every element into a single
            # constraint value per element
            cp <- mapply(min, split(abs(cp), ind))
            ind <- as.integer(names(cp))
            `[<-`(Omega, ind, sign(Omega[ind]) * cp)
        }, Omegas, split(DCP, row(DCP)), degen.ind)
    }

    # Initialize mean squared gradients
    grad2_betas <- Map(array, 0, Map(dim, betas))
    grad2_Omegas <- Map(array, 0, Map(dim, Omegas))
@ -49,12 +117,8 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
    non_improving <- 0L

    # technical access points to dynamically access a multi-dimensional array
    # with its last index
    `X[..., i]` <- slice.expr(X, sample.axis, index = i)
    `F[..., i]` <- slice.expr(F, sample.axis, index = i, drop = FALSE)

    # Iterate till a break condition triggers or till max. nr. of iterations
    for (iter in seq_len(max.iter)) {
@ -62,31 +126,36 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
        grad_betas <- Map(matrix, 0, dimX, dimF)
        Omega <- Reduce(kronecker, rev(Omegas))

        # second order residuals accumulator
        # `sum_i (X_i o X_i - E[X o X | Y = y_i])`
        R2 <- array(0, dim = c(dimX, dimX))

        # negative log-likelihood
        loss <- 0

        for (i in slices.ind) {
            # slice size (nr. of observations in the slice)
            n_i <- length(i)

            sumF_i <- rowSums(eval(`F[..., i]`), dims = length(dimF))

            # Ising natural parameters for the slice (using the slice mean of F)
            diag_params_i <- mlm(sumF_i / n_i, betas)
            params_i <- Omega + diag(as.vector(diag_params_i))
            m2_i <- ising_m2(params_i)

            # accumulate loss
            matX_i <- mat(eval(`X[..., i]`), modes)
            loss <- loss - (
                sum(matX_i * (params_i %*% matX_i)) + n_i * log(attr(m2_i, "prob_0"))
            )

            R2_i <- tcrossprod(matX_i) - n_i * m2_i
            R1_i <- diag(R2_i)
            dim(R1_i) <- dimX

            for (j in modes) {
                grad_betas[[j]] <- grad_betas[[j]] +
                    mcrossprod(R1_i, mlm(sumF_i, betas[-j], modes[-j]), j)
            }
            R2 <- R2 + as.vector(R2_i)
        }
@ -98,17 +167,14 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
        }, modes)

        # update optimization behavioral trackers
        accum_sign <- sign(last_loss - loss) - accum_sign
        non_improving <- max(0L, non_improving - 1L + 2L * (last_loss < loss))

        # check break conditions
        if (abs(accum_sign) > zig.zag.threashold) { break }
        if (non_improving > patience) { break }
        if (abs(last_loss - loss) < eps * last_loss) { break }

        # store current loss for the next iteration
        last_loss <- loss
@ -122,17 +188,219 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
# logging (before parameter update) # logging (before parameter update)
logger(iter, loss, betas, Omegas, grad_betas, grad_Omegas) logger(iter, loss, betas, Omegas, grad_betas, grad_Omegas)
# gradualy decrease the step size
step <- if (is.function(step.size)) step.size(iter) else step.size
# Update Parameters # Update Parameters
betas <- Map(function(beta, grad, m2) { betas <- Map(function(beta, grad, m2) {
beta + (step / (sqrt(m2) + eps)) * grad beta + (step.size / (sqrt(m2) + eps)) * grad
}, betas, grad_betas, grad2_betas) }, betas, grad_betas, grad2_betas)
Omegas <- Map(function(Omega, grad, m2) { Omegas <- Map(function(Omega, grad, m2) {
Omega + (step / (sqrt(m2) + eps)) * grad Omega + (step.size / (sqrt(m2) + eps)) * grad
}, Omegas, grad_Omegas, grad2_Omegas) }, Omegas, grad_Omegas, grad2_Omegas)
# Enforce degeneracy parameter constraints
if (any(degen.mask)) {
# Extract parameters corresponding to degenerate interactions
degen.params <- do.call(rbind, Map(`[`, Omegas, degen.ind))
# Degeneracy Constrained Parameters (sign is dropped)
DCP <- mapply(function(vals, bound) {
logVals <- log(abs(vals))
err <- max(0, sum(logVals) - log(abs(bound)))
exp(logVals - (err / length(vals)))
}, split(degen.params, col(degen.params)), degen.bounds)
# Update values in Omegas such that all degeneracy constraints hold
Omegas <- Map(function(Omega, cp, ind) {
# Combine multiple constraints for each element into a single
# constraint value per element
cp <- mapply(min, split(abs(cp), ind))
ind <- as.integer(names(cp))
`[<-`(Omega, ind, sign(Omega[ind]) * cp)
}, Omegas, split(DCP, row(DCP)), degen.ind)
}
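# NOTE: the rescaling above acts on the log scale: for degenerate interaction
# values `vals` with product bound `bound`, the excess
#   err = max(0, sum(log(abs(vals))) - log(abs(bound)))
# is split evenly, so exp(log(abs(vals)) - err / length(vals)) shrinks the
# values just enough that prod(abs(vals)) <= abs(bound) holds again; e.g.
# vals = c(2, 4) with bound = 4 gives err = log(2), i.e. both values are
# divided by sqrt(2) and the product drops from 8 to exactly 4.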
}
structure(
list(eta1 = array(0, dimX), betas = betas, Omegas = Omegas),
tensor_normal = fit_normal,
Omegas.init = Omegas.init,
degen.mask = degen.mask
)
}
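The fit now returns an `eta1` component (here a zero array) alongside the estimates and exposes initialization diagnostics as attributes; a hypothetical consumer, mirroring the interactive block below:

# fit <- gmlm_ising(X, y)
# Reduce(kronecker, rev(fit$betas))   # assembled coefficient matrix
# attr(fit, "tensor_normal")          # tensor normal fit used for initialization
# attr(fit, "Omegas.init")            # initial interaction matrices
# attr(fit, "degen.mask")             # marks degenerate interaction parameters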
################################################################################
### Development Interactive Block (Delete / Make sim / TODO: ...) ###
################################################################################
if (FALSE) { # interactive()
par(bg = "#1d1d1d",
fg = "lightgray",
col = "#d5d5d5",
col.axis = "#d5d5d5",
col.lab = "#d5d5d5",
col.main = "#d5d5d5",
col.sub = "#d5d5d5", # col.sub = "#2467d0"
pch = 16
)
cex <- 1.25
col <- colorRampPalette(c("#f15050", "#1d1d1d", "#567DCA"))(256)
.logger <- function() {
iter <- 0L
assign("log", data.frame(
iter = rep(NA_integer_, 100000),
loss = rep(NA_real_, 100000),
dist.B = rep(NA_real_, 100000),
dist.Omega = rep(NA_real_, 100000),
norm.grad.B = rep(NA_real_, 100000),
norm.grad.Omega = rep(NA_real_, 100000)
), envir = .GlobalEnv)
assign("B.gmlm", NULL, .GlobalEnv)
assign("Omega.gmlm", NULL, .GlobalEnv)
function(it, loss, betas, Omegas, grad_betas, grad_Omegas) {
# Store in global environment (allows stopping early while keeping results)
B.gmlm <- Reduce(kronecker, rev(betas))
assign("B.gmlm", B.gmlm, .GlobalEnv)
Omega.gmlm <- Reduce(kronecker, rev(Omegas))
assign("Omega.gmlm", Omega.gmlm, .GlobalEnv)
dist.B <- dist.subspace(B.true, B.gmlm, normalize = TRUE)
dist.Omega <- norm(Omega.true - Omega.gmlm, "F")
norm.grad.B <- sqrt(sum(mapply(norm, grad_betas, "F")^2))
norm.grad.Omega <- sqrt(sum(mapply(norm, grad_Omegas, "F")^2))
log[iter <<- iter + 1L, ] <<- list(
it, loss, dist.B, dist.Omega, norm.grad.B, norm.grad.Omega
)
cat(sprintf("\r%3d - d(B): %.3f, d(O): %.3f, |g(B)|: %.3f, |g(O)|: %.3f, loss: %.3f\033[K",
it, dist.B, dist.Omega, norm.grad.B, norm.grad.Omega, loss))
}
}
sample.size <- 1000
dimX <- c(2, 3) # predictor `X` dimension
dimF <- rep(1, length(dimX)) # "function" `F(y)` of response `y` dimension
betas <- Map(diag, 1, dimX, dimF)
Omegas <- list(toeplitz(c(0, -2)), toeplitz(seq(1, 0, by = -0.5)))
B.true <- Reduce(kronecker, rev(betas))
Omega.true <- Reduce(kronecker, rev(Omegas))
# data sampling routine
c(X, F, y, sample.axis) %<-% (sample.data <- function(sample.size, betas, Omegas) {
dimX <- mapply(nrow, betas)
dimF <- mapply(ncol, betas)
# generate response (sample axis is last axis)
y <- runif(prod(sample.size, dimF), -2, 2)
F <- array(y, dim = c(dimF, sample.size)) # entries ~ U[-2, 2]
Omega <- Reduce(kronecker, rev(Omegas))
X <- apply(F, length(dim(F)), function(Fi) {
dim(Fi) <- dimF
params <- diag(as.vector(mlm(Fi, betas))) + Omega
tensorPredictors::ising_sample(1, params)
})
dim(X) <- c(dimX, sample.size)
list(X = X, F = F, y = y, sample.axis = length(dim(X)))
})(sample.size, betas, Omegas)
local({
X.proto <- array(seq_len(prod(dimX)), dimX)
interactions <- crossprod(mat(X, sample.axis))
dimnames(interactions) <- rep(list(
do.call(paste0, c("X", Map(slice.index, list(X.proto), seq_along(dimX))))
), 2)
cat("Sample Size: ", sample.size, "\n")
print.table(interactions, zero.print = ".")
})
# fit the GMLM Ising model (and profile it), keeping the fit and the
# convergence log needed by the plots below
Rprof()
system.time({
  fit.gmlm <- gmlm_ising(X, y, logger = .logger())
})
Rprof(NULL)
summaryRprof()
B.gmlm <- Reduce(kronecker, rev(fit.gmlm$betas))
Omega.gmlm <- Reduce(kronecker, rev(fit.gmlm$Omegas))
B.normal <- Reduce(kronecker, rev(attr(fit.gmlm, "tensor_normal")$betas))
Omega.init <- Reduce(kronecker, rev(attr(fit.gmlm, "Omegas.init")))
degen.mask <- attr(fit.gmlm, "degen.mask")
local({
layout(matrix(c(
1, 2, 3, 3, 3,
1, 4, 5, 6, 7
), nrow = 2, byrow = TRUE), widths = c(6, 3, 1, 1, 1))
with(na.omit(log), {
plot(range(iter), c(0, 1), type = "n", bty = "n",
xlab = "Iterations", ylab = "Distance")
lines(iter, dist.B, col = "red", lwd = 2)
lines(iter, dist.Omega / max(dist.Omega), col = "blue", lwd = 2)
lines(iter, (loss - min(loss)) / diff(range(loss)), col = "darkgreen", lwd = 2)
norm.grad <- sqrt(norm.grad.B^2 + norm.grad.Omega^2)
# Scale all gradient norms
norm.grad.B <- norm.grad.B / max(norm.grad)
norm.grad.Omega <- norm.grad.Omega / max(norm.grad)
norm.grad <- norm.grad / max(norm.grad)
lines(iter, norm.grad.B, lty = 2, col = "red")
lines(iter, norm.grad.Omega, lty = 2, col = "blue")
lines(iter, norm.grad, lty = 2, col = "darkgreen")
axis(4, at = c(
tail(dist.B, 1),
min(dist.B)
), labels = round(c(
tail(dist.B, 1),
min(dist.B)
), 2), col = NA, col.ticks = "red", las = 1)
axis(4, at = c(
1,
tail(dist.Omega, 1) / max(dist.Omega),
min(dist.Omega) / max(dist.Omega)
), labels = round(c(
max(dist.Omega),
tail(dist.Omega, 1),
min(dist.Omega)
), 2), col = NA, col.ticks = "blue", las = 1)
abline(h = c(tail(dist.B, 1), min(dist.B)),
lty = "dotted", col = "red")
abline(h = c(max(dist.Omega), tail(dist.Omega, 1), min(dist.Omega)) / max(dist.Omega),
lty = "dotted", col = "blue")
})
legend("topright", col = c("red", "blue", "darkgreen"), lty = 1, lwd = 2,
legend = c("dist.B", "dist.Omega", "loss"), bty = "n")
zlim <- max(abs(range(Omega.true, Omega.init, Omega.gmlm))) * c(-1, 1)
matrixImage(Omega.true, main = "true", zlim = zlim, add.values = TRUE, col = col, cex = cex)
matrixImage(round(Omega.init, 2), main = "init (cond. prob.)", zlim = zlim, add.values = TRUE, col = col, cex = cex)
mtext(round(norm(Omega.true - Omega.init, "F"), 3), 3)
matrixImage(round(Omega.gmlm, 2), main = "gmlm (ising)", zlim = zlim, add.values = TRUE, col = col, cex = cex,
col.values = c(par("col"), "red")[`[<-`(array(1, rep(prod(dim(X)[-sample.axis]), 2)), degen.mask, 2)])
mtext(round(norm(Omega.true - Omega.gmlm, "F"), 3), 3)
zlim <- max(abs(range(B.true, B.normal, B.gmlm))) * c(-1, 1)
matrixImage(B.true, main = "true",
zlim = zlim, add.values = TRUE, col = col, cex = cex)
matrixImage(round(B.normal, 2), main = "init (normal)",
zlim = zlim, add.values = TRUE, axes = FALSE, col = col, cex = cex)
mtext(round(dist.subspace(B.true, B.normal, normalize = TRUE), 3), 3)
matrixImage(round(B.gmlm, 2), main = "gmlm (ising)",
zlim = zlim, add.values = TRUE, axes = FALSE, col = col, cex = cex)
mtext(round(dist.subspace(B.true, B.gmlm, normalize = TRUE), 3), 3)
})
}

View File

@@ -1,109 +1,10 @@
#' Specialized version of GMLM for the tensor normal model
#'
#' The underlying algorithm is an ``iterative (block) coordinate descent'' method
#'
#' @export
gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)),
  max.iter = 100L, proj.betas = NULL, proj.Omegas = NULL, logger = NULL,
  cond.threshold = 25, eps = 1e-6
) {
  # rearrange `X`, `F` such that the last axis enumerates observations
@@ -152,7 +53,7 @@ gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)),
}, mcov(F, sample.axis, center = FALSE))
# initialization of betas ``covariance direction mappings``
betas <- betas.init <- Map(function(dX, dF) {
  s <- min(ncol(dX), nrow(dF))
  crossprod(dX[1:s, , drop = FALSE], dF[1:s, , drop = FALSE])
}, dirsX, dirsF)
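The initialization pairs leading ``covariance direction'' bases of `X` and `F` mode-wise. A toy sketch of the shape arithmetic, with random orthonormal stand-ins for `dirsX`/`dirsF` (their actual construction from `mcov()` lies above this hunk):

p <- 4; q <- 2
dX <- qr.Q(qr(matrix(rnorm(p^2), p)))  # stand-in direction basis for X
dF <- qr.Q(qr(matrix(rnorm(q^2), q)))  # stand-in direction basis for F
s <- min(ncol(dX), nrow(dF))           # nr. of shared leading directions
beta.init <- crossprod(dX[1:s, , drop = FALSE], dF[1:s, , drop = FALSE])
dim(beta.init)  # c(4, 2): X-directions mapped onto F-directions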
@@ -226,132 +127,8 @@ gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)),
}
structure(
  list(eta1 = mlm(meanX, Sigmas), betas = betas, Omegas = Omegas),
  betas.init = betas.init
)
}
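With the changed return value, `eta1` is reported as `mlm(meanX, Sigmas)` rather than the raw mean, and the initial `betas` travel along as an attribute; a hypothetical read:

# fit <- gmlm_tensor_normal(X, F)
# fit$eta1                  # first natural parameter estimate
# attr(fit, "betas.init")   # mode-wise initial coefficient matrices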

View File

@@ -19,6 +19,7 @@
matrixImage <- function(A, add.values = FALSE,
  main = NULL, sub = NULL, interpolate = FALSE, ..., zlim = NA,
  axes = TRUE, asp = 1, col = hcl.colors(24, "Blue-Red 3", rev = FALSE),
  col.values = par("col"), cex = 1,
  digits = getOption("digits"), new.plot = TRUE
) {
  # plot raster image
@@ -56,6 +57,7 @@ matrixImage <- function(A, add.values = FALSE,
  A[!add.values] <- NA
  A[add.values] <- format(A[add.values], digits = digits)
}
text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A,
  adj = 0.5, cex = cex, col = col.values)
}
}
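The new `col.values` and `cex` arguments style the overlaid cell values, used in the Ising simulation above to flag degenerate interaction parameters in red. A small usage sketch (assuming the modified `matrixImage()` from this file):

A <- matrix(round(rnorm(9), 2), 3)
matrixImage(A, add.values = TRUE, cex = 1.25,
  col.values = ifelse(abs(A) > 1, "red", par("col")))  # highlight large cells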