fix: stability of gmlm_ising and added slicing;
add: higher-order approx.kron; add: paper.tex - simulation plots and tensor normal MLE estimation

parent 6477d78190
commit 399f878fbb

LaTeX/paper.tex | 276
@@ -487,15 +487,15 @@ Equation \eqref{eq:eta2} is underdetermined since $\t{(\mat{T}_2\pinv{\mat{D}_p}
\todo{Maybe already here introduce the ``constraint'' set of $\Omega$'s allowed as $\{ \Omega\in\mathbb{R}_{++}^{p\times p} : \vec{\Omega} = \t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p})}\vec{\Omega} \}$}

In a classical \emph{generalized linear model} (GLM), the link function connecting the natural parameters to the expectation of the sufficient statistic $\mat{\eta}_y = \mat{g}(\E[\mat{t}(\ten{X}) \mid Y = y])$ is invertible. Such a link may not exist in our setting, but for our purposes what we call the ``inverse'' link suffices. The ``inverse'' link $\widetilde{\mat{g}}$ exists because the natural parameters fully describe the distribution. As in the non-minimal formulation \eqref{eq:quadratic-exp-fam}, we define the ``inverse'' link through its tensor-valued components
-\begin{align*}
+\begin{align}
-	\ten{g}_1(\mat{\eta}_y) &= \E[\ten{X} \mid Y = y], \\
+	\ten{g}_1(\mat{\eta}_y) &= \E[\ten{X} \mid Y = y], \label{eq:inv-link1}\\
-	\ten{g}_2(\mat{\eta}_y) &= \E[\ten{X}\circ\ten{X} \mid Y = y]
+	\ten{g}_2(\mat{\eta}_y) &= \E[\ten{X}\circ\ten{X} \mid Y = y] \label{eq:inv-link2}
-\end{align*}
+\end{align}
as $\widetilde{\mat{g}}(\mat{\eta}_y) = (\vec\ten{g}_1(\mat{\eta}_y), \vec\ten{g}_2(\mat{\eta}_y))$.

Under the quadratic exponential family model \eqref{eq:quadratic-exp-fam}, a sufficient reduction for the regression of $Y$ on $\ten{X}$ is given in \cref{thm:sdr}.
\begin{theorem}[SDR]\label{thm:sdr}
-	A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:quadratic-exp-fam} with natural parameters \eqref{eq:eta1} and \eqref{eq:eta2} is given by
+	A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:quadratic-exp-fam} with natural parameters modeled by \eqref{eq:eta1} and \eqref{eq:eta2} is given by
	\begin{align}\label{eq:sdr}
		\ten{R}(\ten{X})
		% &= (\ten{X} - \E\ten{X})\times\{ \t{\mat{\beta}_1}, \ldots, \t{\mat{\beta}_r} \}.
@@ -551,7 +551,7 @@ The maximum likelihood estimate of $\mat{\theta}_0$ is the solution to the optim
\end{equation}
with $\hat{\mat{\theta}}_n = (\vec\widehat{\overline{\ten{\eta}}}, \vec\widehat{\mat{B}}, \vech\widetilde{\mat{\Omega}})$ where $\widehat{\mat{B}} = \bigkron_{k = r}^{1}\widehat{\mat{\beta}}_k$ and $\widehat{\mat{\Omega}} = \bigkron_{k = r}^{1}\widehat{\mat{\Omega}}_k$.

-A straightforward and general method for parameter estimation is \emph{gradient descent}. For pure algorithmic speedup, changing only the update rule but \emph{not} the gradient, denoted $\nabla_{\mat{\theta}}l_n$, of the objective function, we use an adapted version of \emph{Nesterov Accelerated Gradient} descent (NAG), originally described in \textcite{Nesterov1983}, with an internal line search to determine the step size in each iteration, as described in \cref{alg:NAGD}. To apply gradient-based optimization, we compute the gradients of $l_n$ in \cref{thm:grad}.
+A straightforward and general method for parameter estimation is \emph{gradient descent}. To apply gradient-based optimization, we compute the gradients of $l_n$ in \cref{thm:grad}.

\begin{theorem}\label{thm:grad}
For $n$ i.i.d. observations $(\ten{X}_i, y_i), i = 1, ..., n$, the log-likelihood has the form \eqref{eq:log-likelihood} with $\mat{\theta}$ being the collection of all GMLM parameters $\overline{\ten{\eta}}$, ${\mat{B}} = \bigkron_{k = r}^{1}{\mat{\beta}}_k$ and ${\mat{\Omega}} = \bigkron_{k = r}^{1}{\mat{\Omega}}_k$ for $k = 1, ..., r$. Furthermore, let $\ten{G}_2(\mat{\eta}_y)$ be a tensor of dimensions $p_1, \ldots, p_r$ such that
@@ -569,43 +569,84 @@ A straightforward and general method for parameter estimation is \emph{gradient
If $\mat{T}_2$ is the identity matrix $\mat{I}_{p(p + 1) / 2}$, then $\ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y)$.
\end{theorem}

-\begin{algorithm}[hpt!]
-\caption{\label{alg:NAGD}Nesterov Accelerated Gradient Descent}
-\begin{algorithmic}[1]
-	\State Objective: $l_n(\mat{\theta})$
-	\State Arguments: Order $r + 1$ tensors $\ten{X}$, $\ten{F}$ \todo{ask Effie about input description}
-	\State Initialize: Parameters $\mat{\theta}^{(0)}$, $0 < \delta^{(1)}$ and $0 < \gamma < 1$
-	\State $t \leftarrow 1$ \Comment{step counter}
-	\State $\mat{\theta}^{(1)} \leftarrow \mat{\theta}^{(0)}$ \Comment{artificial first step}
-	\State $(m^{(0)}, m^{(1)}) \leftarrow (0, 1)$ \Comment{momentum extrapolation weights}
-	\Repeat \Comment{repeat until convergence}
-		\State $\mat{M} \leftarrow \mat{\theta}^{(t)} + \frac{m^{(t - 1)} - 1}{m^{(t)}}(\mat{\theta}^{(t)} - \mat{\theta}^{(t - 1)})$ \Comment{momentum extrapolation}
-		\For{$\delta = \gamma^{-1}\delta^{(t)}, \delta^{(t)}, \gamma\delta^{(t)}, \gamma^2\delta^{(t)}, \ldots$} \Comment{line search}
-			\State $\mat{\theta} \leftarrow \mat{M} + \delta \nabla l_n(\mat{M})$
-			\If{$l_n(\mat{\theta}) \leq l_n(\mat{\theta}^{(t - 1)}) - \delta \|\nabla_{\mat{\theta}} l_n(\mat{M})\|_F^2$} \Comment{Armijo condition}
-				\State $\mat{\theta}^{(t + 1)} \leftarrow \mat{\theta}$
-				\State $\delta^{(t + 1)} \leftarrow \delta$
-				\Break
-			\EndIf
-		\EndFor
-		\State $m^{(t + 1)} \leftarrow \frac{1 + \sqrt{1 + (2 m^{(t)})^2}}{2}$ \Comment{update extrapolation weights}
-		\State $t \leftarrow t + 1$
-	\Until converged
-\end{algorithmic}
-\end{algorithm}
-
-\todo{Update the following, especially check that $\ten{X}$ needs to be centered (as in the current version implicitly assumed)}
-\cref{alg:NAGD} requires initial parameter estimates $\mat{\theta}^{(0)}$. Therefore, we choose a simple heuristic approach by starting with $\mat{\Omega}_k^{(0)} = \mat{I}_{p_k}$ and $\overline{\ten{\eta}}_1^{(0)}$ dependent on the particular distribution. For example, for a tensor normal distribution the sample mean is a good choice, while for the Ising model zero initialization has nice properties. The interesting parts are the multi-linear predictors $\mat{\beta}_k$. Therefore, let $\mat{\Sigma}_k = \frac{q_k}{n q}\sum_{i = 1}^n {\ten{F}_{y_i}}_{(k)}\t{{\ten{F}_{y_i}}_{(k)}}$ and $\mat{\Delta}_k = \frac{p_k}{n p}\sum_{i = 1}^n {\ten{X}_{i}}_{(k)}\t{{\ten{X}_{i}}_{(k)}}$ be mode-wise sample covariance estimates. For both we take the order $s_k = \min(p_k, q_k)$ SVD approximation $\mat{\Delta}_k \approx \mat{U}_k\mat{D}_k\t{\mat{U}_k}$ and $\mat{\Sigma}_k \approx \mat{V}_k\mat{S}_k\t{\mat{V}_k}$. The approximate SVD gives us the first $s_k$ singular vectors with dimensions $\mat{U}_k(p_k\times s_k), \mat{D}_k(s_k\times s_k)$ and $\mat{V}_k(q_k\times s_k), \mat{S}_k(s_k\times s_k)$. Then we initialize
-\begin{displaymath}
-	\mat{\beta}_k^{(0)} = \mat{U}_k\sqrt{\mat{D}_k}\sqrt{\mat{S}_k}\t{\mat{V}_k}.
-\end{displaymath}
-The default hyper-parameters in \cref{alg:NAGD} are set to $\delta^{(1)} = 1 / 100$ for the initial step size, and the line search step parameter $\gamma = 2 / (1 + \sqrt{5})$ is the inverse golden ratio.
+Although any GMLM model can in principle be fitted via gradient descent using \cref{thm:grad}, this may be very inefficient. In the case of the tensor normal distribution, an iterative cyclic updating scheme can be derived by setting the partial gradients of \cref{thm:grad} to zero. Described in more detail in \cref{sec:tensor_normal_estimation}, this method converges much faster, is stable, and does not require any hyperparameters. On the other hand, the Ising model does not allow such a scheme; there we need to use a gradient-based method, which is the subject of \cref{sec:ising_estimation}.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Tensor Normal}\label{sec:tensor_normal_estimation}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+With $\ten{X}\mid Y = y$ following a tensor normal distribution, its density, with mean $\ten{\mu}_y$ and covariance $\mat{\Sigma} = \bigkron_{k = r}^{1}\mat{\Sigma}_k$, is given by
+\begin{displaymath}
+	f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p / 2}\prod_{k = 1}^{r}\det(\mat{\Sigma}_k)^{-p / 2 p_k}\exp\left( -\frac{1}{2}\left\langle\ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1} \right\rangle \right).
+\end{displaymath}
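For completeness, a short verification of how the natural parameters can be read off (this uses only bilinearity of the inner product and symmetry of the $\mat{\Sigma}_k$'s; the $\ten{X}$-free term is absorbed into the log-partition function):
\begin{displaymath}
	-\frac{1}{2}\Bigl\langle\ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1}\Bigr\rangle
	= \Bigl\langle\ten{X}, \ten{\mu}_y\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1}\Bigr\rangle
	- \frac{1}{2}\Bigl\langle\ten{X}, \ten{X}\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1}\Bigr\rangle
	- \frac{1}{2}\Bigl\langle\ten{\mu}_y, \ten{\mu}_y\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1}\Bigr\rangle.
\end{displaymath}
The term linear in $\ten{X}$ identifies the first-moment natural parameter, and the quadratic term identifies the second-moment one, which is where the scaling constant $c = -1/2$ below comes from.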
+We assume the distribution is non-degenerate, which means that the covariances $\mat{\Sigma}_k$ are symmetric positive definite matrices. For the sake of simplicity, we assume w.l.o.g. that $\ten{X}$ has marginal expectation equal to zero, $\E\ten{X} = 0$. Rewriting this in the quadratic exponential family form \eqref{eq:quadratic-exp-fam} determines the scaling constant $c = -1/2$ and gives the relation to the GMLM parameters $\overline{\ten{\eta}}, \mat{\beta}_k$ and $\mat{\Omega}_k$, for $k = 1, \ldots, r$, as
+\begin{displaymath}
+	\ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k, \qquad
+	\mat{\Omega}_k = \mat{\Sigma}_k^{-1},
+\end{displaymath}
+where we used that $\overline{\ten{\eta}} = 0$ due to $0 = \E\ten{X} = \E\E[\ten{X}\mid Y] = \E\ten{\mu}_Y$ in combination with $\E\ten{F}_Y = 0$. Additionally, all the $\mat{\Omega}_k$'s are symmetric positive definite, because the $\mat{\Sigma}_k$'s are. This leads to another simplification: the matrix $\mat{T}_2$ from \eqref{eq:t-stat} is equal to the identity. This also means that the gradients of the log-likelihood $l_n$ in \cref{thm:grad} are simpler as well. We get
+\begin{displaymath}
+	\ten{g}_1(\mat{\eta}_y) = \E[\ten{X}\mid Y = y] = \ten{\mu}_y, \qquad
+	\ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y) = \E[\ten{X}\circ\ten{X}\mid Y = y] \equiv \bigkron_{k = r}^1\mat{\Sigma}_k + (\vec{\ten{\mu}}_y)\t{(\vec{\ten{\mu}}_y)}.
+\end{displaymath}
+
+In practice, we start the estimation process by demeaning a given data set $(\ten{X}_i, \ten{F}_{y_i})$ with $i = 1, \ldots, n$ observations. After demeaning, only the reduction matrices $\mat{\beta}_k$ and the scatter matrices $\mat{\Omega}_k$ need to be estimated. Then, to solve the optimization problem \eqref{eq:mle}, with $\overline{\ten{\eta}} = 0$ after demeaning, we need some initial values. For the initial estimates $\hat{\mat{\beta}}_k^{(0)}$ we use a simple heuristic approach. First, we compute moment-based mode-wise marginal covariance estimates $\widehat{\mat{\Sigma}}_k(\ten{X})$ and $\widehat{\mat{\Sigma}}_k(\ten{F}_Y)$ given by
+\begin{displaymath}
+	\widehat{\mat{\Sigma}}_k(\ten{X}) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{X}_i)_{(k)}\t{(\ten{X}_i)_{(k)}}, \qquad
+	\widehat{\mat{\Sigma}}_k(\ten{F}_Y) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{F}_{y_i})_{(k)}\t{(\ten{F}_{y_i})_{(k)}}.
+\end{displaymath}
+Then, for every mode $k = 1, \ldots, r$, we compute the first $j = 1, \ldots, q_k$ eigenvectors $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))$ and eigenvalues $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))$ of the marginal covariance estimates. Let
+\begin{align*}
+	\mat{U}_k &= (\mat{v}_1(\widehat{\mat{\Sigma}}_k(\ten{X})), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{X}))), \\
+	\mat{D}_k &= \diag(\lambda_1(\widehat{\mat{\Sigma}}_k(\ten{X}))\lambda_1(\widehat{\mat{\Sigma}}_k(\ten{F}_{Y})), \ldots, \lambda_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{X}))\lambda_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{F}_{Y}))), \\
+	\mat{V}_k &= (\mat{v}_1(\widehat{\mat{\Sigma}}_k(\ten{F}_Y)), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))).
+\end{align*}
+The initial estimates are then set to
+\begin{displaymath}
+	\hat{\mat{\beta}}_k^{(0)} = \mat{U}_k\sqrt{\mat{D}_k}\t{\mat{V}_k}.
+\end{displaymath}
+The initial values of $\mat{\Omega}_k$ are set to the identity, $\mat{\Omega}_k^{(0)} = \mat{I}_{p_k}$.
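A minimal base-R sketch of this initialization (illustration only: the function and helper names are hypothetical, not package API; the exported `gmlm_tensor_normal` performs the actual fitting). It assumes arrays `X` ($p_1\times\dots\times p_r\times n$) and `Fy` ($q_1\times\dots\times q_r\times n$) with the sample axis last and $q_k \leq p_k$:

    mode_cov <- function(A, k) {            # mode-k marginal covariance estimate
        n <- tail(dim(A), 1L)
        Ak <- matrix(aperm(A, c(k, seq_along(dim(A))[-k])), nrow = dim(A)[k])
        tcrossprod(Ak) / n
    }
    init_beta <- function(X, Fy, k) {
        eX <- eigen(mode_cov(X, k), symmetric = TRUE)
        eF <- eigen(mode_cov(Fy, k), symmetric = TRUE)
        qk <- dim(Fy)[k]
        U <- eX$vectors[, seq_len(qk), drop = FALSE]            # U_k
        V <- eF$vectors[, seq_len(qk), drop = FALSE]            # V_k
        d <- eX$values[seq_len(qk)] * eF$values[seq_len(qk)]    # diag(D_k)
        U %*% (sqrt(d) * t(V))                                  # U_k sqrt(D_k) V_k'
    }
    betas0  <- lapply(seq_len(length(dim(X)) - 1L), function(k) init_beta(X, Fy, k))
    Omegas0 <- lapply(dim(X)[-length(dim(X))], diag)            # identity matrices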
+
+Given any estimates $\hat{\mat{\beta}}_1, \ldots, \hat{\mat{\beta}}_r, \hat{\mat{\Omega}}_1, \ldots, \hat{\mat{\Omega}}_r$, we take the gradient $\nabla_{\mat{\beta}_j}$ of the log-likelihood $l_n$ in \eqref{eq:log-likelihood} of the tensor normal distribution from \cref{thm:grad} and keep all other parameters except $\mat{\beta}_j$ fixed. Then, $\nabla_{\mat{\beta}_j}l_n = 0$ has the closed-form solution
+\begin{equation}\label{eq:tensor_normal_beta_solution}
+	\t{\mat{\beta}_j} = \biggl(
+		\sum_{i = 1}^{n}
+		\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\Omega}}_k^{-1}\hat{\mat{\beta}}_k \Bigr)_{(j)}
+		\t{\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\beta}}_k \Bigr)_{(j)}}
+	\biggr)^{-1}
+	\biggl(
+		\sum_{i = 1}^{n}
+		\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\beta}}_k \Bigr)_{(j)}
+		\t{(\ten{X}_{i})_{(j)}}
+	\biggr)
+	\hat{\mat{\Omega}}_j.
+\end{equation}
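For illustration, a base-R sketch of this closed-form update (hypothetical helpers, not package API; `ttm` is a plain mode-$k$ tensor-times-matrix product, and `X`, `Fy` carry the sample axis last):

    ttm <- function(A, M, k) {                  # A x_k M, folded back into an array
        d <- dim(A)
        perm <- c(k, seq_along(d)[-k])
        Ak <- M %*% matrix(aperm(A, perm), nrow = d[k])
        dp <- d[perm]; dp[1] <- nrow(M)
        aperm(array(Ak, dp), order(perm))
    }
    update_beta <- function(j, X, Fy, betas, Omegas) {
        unfold <- function(A) matrix(aperm(A, c(j, seq_along(dim(A))[-j])), nrow = dim(A)[j])
        F1 <- Fy; F2 <- Fy    # F x_{k != j} Omega_k^{-1} beta_k  and  F x_{k != j} beta_k
        for (k in setdiff(seq_along(betas), j)) {
            F1 <- ttm(F1, solve(Omegas[[k]], betas[[k]]), k)
            F2 <- ttm(F2, betas[[k]], k)
        }
        lhs <- tcrossprod(unfold(F1), unfold(F2))   # sum_i (.)_{(j)} (.)_{(j)}'
        rhs <- tcrossprod(unfold(F2), unfold(X))    # sum_i (.)_{(j)} (X_i)_{(j)}'
        t(solve(lhs, rhs %*% Omegas[[j]]))          # beta_j from the display above
    }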
+
+For the scatter matrices $\mat{\Omega}_j$, we need to fudge a bit. Equating the partial gradient of the $j$'th scatter matrix $\mat{\Omega}_j$ in \cref{thm:grad} to zero, $\nabla_{\mat{\Omega}_j}l_n = 0$, gives a quadratic matrix equation, due to the dependence of $\ten{\mu}_y$ on $\mat{\Omega}_j$. In practice though, it is more stable, faster, and equally accurate to use mode-wise covariance estimates via the residuals
+\begin{displaymath}
+	\hat{\ten{R}}_i = \ten{X}_i - \hat{\ten{\mu}}_{y_i} = \ten{X}_i - \ten{F}_{y_i}\mlm_{k = 1}^{r}\hat{\mat{\Omega}}_k^{-1}\hat{\mat{\beta}}_k.
+\end{displaymath}
+The estimates are computed via the intermediate values
+\begin{displaymath}
+	\tilde{\mat{\Sigma}}_j = \sum_{i = 1}^n (\hat{\ten{R}}_i)_{(j)} \t{(\hat{\ten{R}}_i)_{(j)}},
+\end{displaymath}
+which only lack the scaling $\tilde{s}$ such that $\tilde{s}\tilde{\mat{\Sigma}}_j = \hat{\mat{\Omega}}_j^{-1}$. The scaling condition is that the mean squared error has to be equal to the trace of the covariance estimate,
+\begin{displaymath}
+	\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle = \tr\bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k^{-1} = \prod_{k = 1}^{r}\tr{\hat{\mat{\Omega}}_k^{-1}} = \tilde{s}^r\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k}.
+\end{displaymath}
+Solving for the scaling value gives
+\begin{displaymath}
+	\tilde{s} = \biggl(\Bigl(\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k}\Bigr)^{-1}\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle\biggr)^{1 / r},
+\end{displaymath}
+resulting in the estimates $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{-1}$.
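Continuing the sketch (same hypothetical `ttm` helper as above), the residuals, intermediate values $\tilde{\mat{\Sigma}}_j$, and scaling $\tilde{s}$ translate directly:

    update_Omegas <- function(X, Fy, betas, Omegas) {
        n <- tail(dim(X), 1L); r <- length(betas)
        Mu <- Fy                                     # F x_k Omega_k^{-1} beta_k
        for (k in seq_len(r)) Mu <- ttm(Mu, solve(Omegas[[k]], betas[[k]]), k)
        R <- X - Mu                                  # residuals, all i at once
        Sig <- lapply(seq_len(r), function(j) {      # tilde(Sigma)_j
            Rj <- matrix(aperm(R, c(j, seq_along(dim(R))[-j])), nrow = dim(R)[j])
            tcrossprod(Rj)
        })
        traces <- vapply(Sig, function(S) sum(diag(S)), 0)
        s <- ((sum(R^2) / n) / prod(traces))^(1 / r) # mean squared error condition
        lapply(Sig, function(S) solve(s * S))        # Omega_j = (s Sigma_j)^{-1}
    }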
+
+Estimation is then performed by updating, for $j = 1, \ldots, r$, the estimates $\hat{\mat{\beta}}_j$ via \eqref{eq:tensor_normal_beta_solution}; then we recompute the $\hat{\mat{\Omega}}_j$ estimates simultaneously, keeping the $\hat{\mat{\beta}}_j$'s fixed. This procedure is repeated until convergence. Convergence is very fast; experiments showed that it usually occurs in fewer than $10$ iterations.
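Putting the sketches together, the cyclic scheme is a plain driver loop (tolerance and iteration cap illustrative; the exported `gmlm_tensor_normal` implements the actual method):

    for (iter in seq_len(50L)) {
        betas.old <- betas
        for (j in seq_along(betas)) betas[[j]] <- update_beta(j, X, Fy, betas, Omegas)
        Omegas <- update_Omegas(X, Fy, betas, Omegas)
        if (max(abs(unlist(betas) - unlist(betas.old))) < 1e-8) break
    }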
+
+A technical detail for numerical stability is to ensure that the scaled intermediate values $\tilde{s}\tilde{\mat{\Sigma}}_j$, assumed to be symmetric and positive definite, are well conditioned; otherwise, the procedure can become numerically unstable when attempting to compute $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{-1}$. Therefore, we estimate the condition number of $\tilde{s}\tilde{\mat{\Sigma}}_j$ before computing the inverse. If it is ill conditioned, we regularize and use $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j + 0.2 \lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)\mat{I}_{p_j})^{-1}$ instead, where $\lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)$ is the first (maximum) eigenvalue. Experiments showed that this regularization is usually required in the first few iterations only.
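In code, this guard can be a simple condition-number check before the inversion (a sketch; the `rcond` threshold is illustrative):

    inv_scatter <- function(S) {                 # S = s * tilde(Sigma)_j
        if (rcond(S) < 1e-10) {                  # ill conditioned: regularize
            S <- S + 0.2 * max(eigen(S, symmetric = TRUE, only.values = TRUE)$values) * diag(nrow(S))
        }
        solve(S)
    }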
+
+Furthermore, we may assume that the parameter space follows the more general setting of \cref{thm:param-manifold}. In this case, updating may produce estimates outside of the parameter space. A simple and efficient remedy is to project every updated estimate onto the corresponding manifold.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Ising Model}\label{sec:ising_estimation}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newpage

@@ -674,6 +715,8 @@ As a basis to ensure that the constraint parameter space $\Theta$ is a manifold,
then the constrained parameter space $\Theta = \mathbb{R}^p \times \manifold{K}_{\mat{B}}\times\manifold{CK}_{\mat{\Omega}}\subset\mathbb{R}^{p(p + 2 q + 3) / 2}$ is a smooth embedded manifold.
\end{theorem}

+\subsection{Matrix Manifolds}\label{sec:matrix-manifolds}
+
{\color{red}
Flexibility of Manifold Structure

@@ -716,55 +759,6 @@ It is not necessary to have a perfect maximizer, as long as the objective has fi
with asymptotic variance-covariance structure $\mat{\Sigma}_{\mat{\theta}_0}$ given in \eqref{eq:asymptotic-covariance-gmlm}.
\end{theorem}

-% \begin{theorem}[{\textcite[Thm~5.35]{asymStats-van_der_Vaart1998}}]
-% 	Let $\{ p_{\mat{\theta}} : \mat{\theta}\in\Theta \}$ be a collection of identifiable subprobability\footnote{A subprobability measure is a measure $\nu$ for the measure space $(\mathfrak{X}, \nu)$ such that $\nu(\mathfrak{X}) \leq 1$.} densities such that $P_{\mat{\theta}_0}$ is a probability measure. Then $M(\mat{\theta}) = P_{\mat{\theta}_0}\log p_{\mat{\theta}} / p_{\mat{\theta}_0}$ attains its maximum uniquely at $\mat{\theta}_0$.
-% \end{theorem}
-
-% \begin{proof}[Proof of \cref{thm:asymptotic-normality-gmlm}]
-% 	First we prove the consistency for which we apply \cref{thm:consistency-on-metric-spaces} and then prove that there
-
-% 	Most parts of the proof are identical to the proof of \textcite[Thm~3.1]{asymptoticMLE-BuraEtAl2018} with adaptations in a few places.
-
-% 	{\color{green}Bla Bla Bla ...}
-
-% 	\todo{regularity conditions!}
-
-% 	By rewriting \eqref{eq:eta1} as
-% 	\begin{displaymath}
-% 		\mat{\eta}_{1y} =
-% 		\vec{\overline{\ten{\eta}}} + \mat{B}\vec{\ten{F}_y} = (\vec{\overline{\ten{\eta}}}, \mat{B})\begin{pmatrix}
-% 			1 \\ \vec{\ten{F}_y}
-% 		\end{pmatrix} = (\mat{I}_p\otimes \t{(1, \vec{\ten{F}_y})})\vec((\vec{\overline{\ten{\eta}}}, \mat{B}))
-% 	\end{displaymath}
-% 	the natural parameter $\mat{\eta}_y$ with components \eqref{eq:eta1}, \eqref{eq:eta2} can be written with $\mat{\theta} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vec{\mat{\Omega}})$ as
-% 	\begin{displaymath}
-% 		\mat{\eta}_y = \begin{pmatrix}
-% 			\mat{I}_p\otimes \t{(1, \vec{\ten{F}_y})} & 0 \\
-% 			0 & c\t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}
-% 		\end{pmatrix}\mat{\theta} = \mat{F}(y)\mat{\theta}
-% 	\end{displaymath}
-% 	Using the new form, let, with $z = (\ten{X}, y)$,
-% 	\begin{displaymath}
-% 		m_{\mat{\theta}}(z) = \langle \mat{F}(y)\mat{\theta}, \mat{t}(\ten{X}) \rangle - b(\mat{F}(y)\mat{\theta})
-% 	\end{displaymath}
-% 	which allows us to write the log-likelihood \eqref{eq:log-likelihood}, scaled by $n^{-1}$, with $Z_i = (\ten{X}_i, Y_i)$, as
-% 	\begin{displaymath}
-% 		M_n(\mat{\theta}) = \frac{1}{n}\sum_{i = 1}^{n} m_{\mat{\theta}}(Z_i) = \frac{1}{n} l_n(\mat{\theta}).
-% 	\end{displaymath}
-
-% 	The strict convexity of $b$ ensures that $\mat{\theta}\mapsto m_{\mat{\theta}}(z)$ is strictly concave for every fixed $z$. Taking expectation preserves the concavity, yielding $M(\mat{\theta}) = \E m_{\mat{\theta}}(Z)$ to be concave as well. The identifiability \todo{???} of \todo{???} allows us to apply \textcite[Thm~5.35]{asymStats-van_der_Vaart1998}, which gives for the family of conditional densities $\{ f_{\mat{\theta}}(\,.\mid Y) : \mat{\theta}\in\Theta \}$ that
-% 	\begin{displaymath}
-% 		\E[m_{\mat{\theta}}(Z)\mid Y] \leq \E[m_{\mat{\theta}_0}(Z)\mid Y].
-% 	\end{displaymath}
-% 	Taking the expectation with respect to $Y$ gives $M(\mat{\theta}) = \E[m_{\mat{\theta}}(Z)] \leq \E[m_{\mat{\theta}_0}(Z)] = M(\mat{\theta}_0)$ for every $\mat{\theta}$. Now, assume there exists a $\mat{\theta}_1$ such that $M(\mat{\theta}_1) = M(\mat{\theta}_0)$, then $P(\mat{F}(Y)\mat{\theta}_1 = \mat{F}(Y)\mat{\theta}_0) = 1$ which means \todo{figure that out why} which contradicts \todo{$P(\mat{F}(Y)\mat{\theta} = \overline{\ten{\eta}}) = 1$ which is a violation of an assumption.} Finally, with the integrability assumption we can apply \textcite[Thm~A.1]{asymptoticMLE-BuraEtAl2018}, which gives that the MLE $\hat{\mat{\theta}}_n$ of \eqref{eq:mle} exists and is unique.\todo{Fix that!!!}
-
-% 	{\color{green}Foo Bar Baz ...}
-
-% 	\todo{finish me!}
-% \end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Asymptotic Normality}

@@ -814,38 +808,96 @@ The following is a reformulation of \textcite[Lemma~2.3]{asymptoticMLE-BuraEtAl2
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We compare our method with a few other methods for the tensor normal and the Ising model (inverse Ising problem).

-\subsection{Tensor Normal Distribution}
-The \emph{tensor normal distribution} $\mathcal{TN}(\ten{\mu}, \mat{\Sigma}_1, ..., \mat{\Sigma}_r)$ is a generalization of the \emph{matrix normal distribution} $\mathcal{MN}(\mat{\mu}, \mat{\Sigma}_1, \mat{\Sigma}_2)$. The density of the conditional tensor normal distribution $\ten{X}\mid Y = y$ is given by
-\begin{displaymath}
-	f_{\mat{\theta}_y}(\ten{X}\mid Y = y) = (2\pi)^{-p/2}\prod_{k = 1}^r |\mat{\Sigma}_{k}|^{-p / 2 p_{k}}\exp\Big(
-		-\frac{1}{2}\Big\langle \ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k\in[r]}\mat{\Sigma}_{k} \Big\rangle
-	\Big)
-\end{displaymath}
-
-\subsection{Ising Model}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Tensor Normal}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{itemize}
+	\item[a] A simple setup with linear relation for
+	\item[b]
+	\item[c]
+	\item[d]
+	\item[e] Misspecified model .......
+\end{itemize}

\begin{figure}
	\centering
-	\begin{scaletikzpicturetowidth}{0.5\textwidth}
-		\input{plots/sim-normal.tex}
-	\end{scaletikzpicturetowidth}
-	\caption{\label{fig:sim-normal}As a test case we show the $3$-tensor normal (generalization of the matrix normal)}
+	\includegraphics[width = \textwidth]{plots/sim-normal.pdf}
+	\caption{\label{fig:sim-normal}Tensor normal simulation}
\end{figure}

-\subsection{Matrix Normal and Ising Model}
-
-If $\mat{X} \in \mathbb{R}^{p_1 \times p_2}$, then $\mat{\theta} = (\overline{\eta}, \mat{\beta}_1, \mat{\beta}_2, \mat{\Omega}_1, \mat{\Omega}_2)$ and $\mat{F}_y$ is also matrix valued. The conditional pdf of $\mat{X}\mid Y$ is
-\begin{align*}
-	f_{\mat{\theta}}(\mat{X}\mid Y = y)
-	&= h(\mat{X})\exp(\langle\mat{X}, \mat{\eta}_1(\mat{\theta})\rangle + \langle\mat{X}\circ\mat{X}, \mat{\eta}_2(\mat{\theta})\rangle - b(\mat{\eta}_y(\mat{\theta}))) \\
-	&= h(\mat{X})\exp(\tr((\overline{\mat{\eta}} + \mat{\beta}_1\mat{F}_y\t{\mat{\beta}_2})\t{\mat{X}}) + \tr(\mat{\Omega}_1\mat{X}\mat{\Omega}_2\t{\mat{X}}) - b(\mat{\eta}_y(\mat{\theta}))).
-\end{align*}
-The MLE estimate $\hat{\mat{\theta}}_n = (\widehat{\overline{\mat{\eta}}}, \widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1, \widehat{\mat{\Omega}}_2\otimes \widehat{\mat{\Omega}}_1)$ is asymptotically normal and
-\begin{displaymath}
-	\widehat{\ten{R}}(\mat{X}) = \t{(\widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1)}\vec(\mat{X} - \E\mat{X}) \equiv \t{\widehat{\mat{\beta}}_1}(\mat{X} - \E\mat{X})\widehat{\mat{\beta}}_2
-\end{displaymath}
-is the MLE of the sufficient reduction $\ten{R}(\mat{X})$ of dimension $q_1 \times q_2 \leq p_1 \times p_2$.\\[1.6em]
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Ising Model}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{figure}
+	\centering
+	\includegraphics[]{plots/sim-ising.pdf}
+	\caption{\label{fig:sim-ising}Ising model simulation}
+\end{figure}
+
+\begin{table}
+	\begin{tabular}{c | ccc ccc c}
+		$n$ & GMLM & PCA & HOPCA & LPCA & CLPCA & TSIR & MGCCA \\
+		\hline
+		100 & {\bf 0.34} (0.14) & 0.90 (0.04) & 0.90 (0.05) & 0.94 (0.09) & 0.91 (0.03) & 0.48 (0.19) & 0.55 (0.13) \\
+		200 & {\bf 0.25} (0.11) & 0.90 (0.03) & 0.90 (0.03) & 0.96 (0.07) & 0.91 (0.02) & 0.38 (0.16) & 0.53 (0.10) \\
+		300 & {\bf 0.20} (0.09) & 0.89 (0.02) & 0.89 (0.02) & 0.97 (0.06) & 0.91 (0.02) & 0.29 (0.13) & 0.51 (0.11) \\
+		500 & {\bf 0.16} (0.07) & 0.90 (0.02) & 0.90 (0.02) & 0.98 (0.01) & 0.91 (0.01) & 0.23 (0.10) & 0.50 (0.08) \\
+		750 & {\bf 0.13} (0.05) & 0.90 (0.01) & 0.90 (0.01) & 0.98 (0.02) & 0.91 (0.01) & 0.23 (0.08) & 0.53 (0.06)
+	\end{tabular}
+	\caption{\label{tab:sim-ising}Ising model simulation}
+\end{table}
+
+% The \emph{tensor normal distribution} $\mathcal{TN}(\ten{\mu}, \mat{\Sigma}_1, ..., \mat{\Sigma}_r)$ is a generalization of the \emph{matrix normal distribution} $\mathcal{MN}(\mat{\mu}, \mat{\Sigma}_1, \mat{\Sigma}_2)$. \todo{ref} The density of the conditional tensor normal distribution $\ten{X}\mid Y = y$ according to the quadratic exponential family \eqref{eq:quadratic-exp-fam}, where only the first moment depends on $y$, is given by
+% \begin{displaymath}
+% 	f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p/2}\prod_{k = 1}^r |\mat{\Sigma}_{k}|^{-p / 2 p_{k}}\exp\Big(
+% 		-\frac{1}{2}\Big\langle \ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k\in[r]}\mat{\Sigma}_{k} \Big\rangle
+% 	\Big)
+% \end{displaymath}
+% Rewriting this in the form of an exponential family as in \eqref{eq:quadratic-exp-fam} allows us to determine the natural parameter components $\mat{\eta}_{y1}$ and $\mat{\eta}_2$. Since a non-degenerate normal distribution requires the covariance matrices $\mat{\Sigma}_k$ to be symmetric positive definite, the relation to the second-moment natural parameter $\mat{\eta}_2$ simplifies, as we can set $\mat{T}_2$ in \eqref{eq:eta2-manifold} to the identity. This then gives the relation to the natural parameters as in \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold} as
+% \begin{displaymath}
+% 	\mat{\eta}_{1y} = \vec\Bigl(\ten{\mu}_y\mlm_{k = 1}^{r}\mat{\Sigma}_k\Bigr), \qquad
+% 	\mat{\eta}_2 = c\t{\mat{D}_p}\vec\bigkron_{k = r}^{1}\mat{\Sigma}_k^{-1}
+% \end{displaymath}
+% with scaling constant $c = -1 / 2$. Modeling the natural parameters as in \eqref{eq:eta1} and \eqref{eq:eta2} relates the mean $\ten{\mu}_y$ and the covariance matrices $\mat{\Sigma}_k$ of the tensor normal to the generalized multi-linear model parameters $\overline{\ten{\eta}}$ and $\mat{\beta}_k$, $\mat{\Omega}_k$, for $k = 1, \ldots, r$, through
+% \begin{displaymath}
+% 	\ten{\mu}_y = \Bigl(\overline{\ten{\eta}} + \ten{F}_y\mlm_{j = 1}^{r}\mat{\beta}_j\Bigr)\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}, \qquad \mat{\Omega}_k = \mat{\Sigma}_k^{-1}.
+% \end{displaymath}
+% This completely determines the tensor normal distribution given the GMLM parameters.
+
+% To estimate the GMLM parameters $\mat{\theta} = (\overline{\ten{\eta}}, \mat{\beta}_1, \ldots, \mat{\beta}_r, \mat{\Omega}_1, \ldots, \mat{\Omega}_r)$ given a data set $(\ten{X}_i, y_i)$ of $i = 1, \ldots, n$ observations, we use the gradients provided by \cref{thm:grad}. It turns out that the equations $\nabla_{\overline{\ten{\eta}}}l_n = 0, \nabla_{\mat{\beta}_j}l_n = 0$ and $\nabla_{\mat{\Omega}_j}l_n = 0$, for $j = 1, \ldots, r$, can be solved for the differentiation variable assuming all the other parameter blocks to be constant. Centering the observed $\ten{X}$ leads to a cleaner formulation given by \todo{fix the following!}
+% \begin{align*}
+% 	\hat{\overline{\ten{\eta}}} &= \frac{1}{n}\sum_{i = 1}^{n} \ten{X}_i, \\
+% 	\t{\hat{\mat{\beta}}_j} &= \mat{\Omega}_j \Bigl(\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\Omega}_k^{-1}\mat{\beta}_k\Bigr)_{(j)}\t{\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\beta}_k\Bigr)_{(j)}}\Bigr)^{-1}\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\beta}_k\Bigr)_{(j)}\ten{X}_{(j)}, \\
+% 	\t{\Omega}_j &= scaling \frac{1}{n}
+% \end{align*}
+
+% This allows the use of a \emph{block coordinate descent} method instead of gradient descent. This method keeps all but one parameter block fixed and optimizes the objective for a single block. Given a closed form solution for the partial gradients, only a single update is required to solve the partial optimization problem, meaning the block coordinate descent method reduces to cyclic updating. This not only converges very fast, it also does not require any hyperparameters.
+
+% For this iterative scheme we do need some initial estimates. For those we
+
+% \subsection{Matrix Normal and Ising Model}
+
+% If $\mat{X} \in \mathbb{R}^{p_1 \times p_2}$, then $\mat{\theta} = (\overline{\eta}, \mat{\beta}_1, \mat{\beta}_2, \mat{\Omega}_1, \mat{\Omega}_2)$ and $\mat{F}_y$ is also matrix valued. The conditional pdf of $\mat{X}\mid Y$ is
+% \begin{align*}
+% 	f_{\mat{\theta}}(\mat{X}\mid Y = y)
+% 	&= h(\mat{X})\exp(\langle\mat{X}, \mat{\eta}_1(\mat{\theta})\rangle + \langle\mat{X}\circ\mat{X}, \mat{\eta}_2(\mat{\theta})\rangle - b(\mat{\eta}_y(\mat{\theta}))) \\
+% 	&= h(\mat{X})\exp(\tr((\overline{\mat{\eta}} + \mat{\beta}_1\mat{F}_y\t{\mat{\beta}_2})\t{\mat{X}}) + \tr(\mat{\Omega}_1\mat{X}\mat{\Omega}_2\t{\mat{X}}) - b(\mat{\eta}_y(\mat{\theta}))).
+% \end{align*}
+% The MLE estimate $\hat{\mat{\theta}}_n = (\widehat{\overline{\mat{\eta}}}, \widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1, \widehat{\mat{\Omega}}_2\otimes \widehat{\mat{\Omega}}_1)$ is asymptotically normal and
+% \begin{displaymath}
+% 	\widehat{\ten{R}}(\mat{X}) = \t{(\widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1)}\vec(\mat{X} - \E\mat{X}) \equiv \t{\widehat{\mat{\beta}}_1}(\mat{X} - \E\mat{X})\widehat{\mat{\beta}}_2
+% \end{displaymath}
+% is the MLE of the sufficient reduction $\ten{R}(\mat{X})$ of dimension $q_1 \times q_2 \leq p_1 \times p_2$.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\appendix

@@ -0,0 +1,151 @@ (new file: Ising simulation plot)
%%% R code to generate the input data files from corresponding simulation logs
% R> setwd("~/Work/tensorPredictors")
% R>
% R> for (sim.name in c("2a")) {
% R>     pattern <- paste0("sim\\_", sim.name, "\\_ising\\-[0-9T]*\\.csv")
% R>     log.file <- sort(
% R>         list.files(path = "sim/", pattern = pattern, full.names = TRUE),
% R>         decreasing = TRUE
% R>     )[[1]]
% R>
% R>     sim <- read.csv(log.file)
% R>
% R>     aggr <- aggregate(sim[, names(sim) != "sample.size"], list(sample.size = sim$sample.size), mean)
% R>
% R>     write.table(aggr, file = paste0("LaTeX/plots/aggr-", sim.name, "-ising.csv"), row.names = FALSE, quote = FALSE)
% R> }
\documentclass[border=0cm]{standalone}

\usepackage{tikz}
\usepackage{pgfplots}
\usepackage{bm}

\definecolor{gmlm}{RGB}{0,0,0}
\definecolor{mgcca}{RGB}{86,180,233}
\definecolor{tsir}{RGB}{0,158,115}
\definecolor{pca}{RGB}{240,228,66}
\definecolor{hopca}{RGB}{230,159,0}
\definecolor{lpca}{RGB}{127,127,127}
\definecolor{clpca}{RGB}{191,191,191}

\pgfplotsset{
    every axis/.style={
        xtick={100,200,300,500,750},
        ymin=-0.05, ymax=1.05,
        grid=both,
        grid style={gray, dotted}
    },
    every axis plot/.append style={
        mark = *,
        mark size = 1pt,
        line width=0.8pt
    }
}
\tikzset{
    legend entry/.style={
        mark = *,
        mark size = 1pt,
        mark indices = {2},
        line width=0.8pt
    }
}

\begin{document}
\begin{tikzpicture}[>=latex]

\begin{axis}[
    name=sim-2a
]
    \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-2a-ising.csv};
    \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-2a-ising.csv};
    \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-2a-ising.csv};
    \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-2a-ising.csv};
    \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-2a-ising.csv};
    \addplot[color = lpca] table[x = sample.size, y = dist.subspace.lpca] {aggr-2a-ising.csv};
    \addplot[color = clpca] table[x = sample.size, y = dist.subspace.clpca] {aggr-2a-ising.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-2a.north west) {
    a: small
};

% \begin{axis}[
%     name=sim-1b,
%     anchor=north west, at={(sim-2a.right of north east)}, xshift=0.1cm,
%     xticklabel=\empty,
%     ylabel near ticks, yticklabel pos=right
% ]
%     \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1b-normal.csv};
%     \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1b-normal.csv};
%     \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1b-normal.csv};
%     \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1b-normal.csv};
%     \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1b-normal.csv};
% \end{axis}
% \node[anchor = base west, yshift = 0.3em] at (sim-1b.north west) {
%     b: cubic dependence on $y$
% };

% \begin{axis}[
%     name=sim-1c,
%     anchor=north west, at={(sim-2a.below south west)}, yshift=-.8em,
%     xticklabel=\empty
% ]
%     \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1c-normal.csv};
%     \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1c-normal.csv};
%     \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1c-normal.csv};
%     \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1c-normal.csv};
%     \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1c-normal.csv};
% \end{axis}
% \node[anchor = base west, yshift = 0.3em] at (sim-1c.north west) {
%     c: rank $1$ $\boldsymbol{\beta}$'s
% };

% \begin{axis}[
%     name=sim-1d,
%     anchor=north west, at={(sim-1c.right of north east)}, xshift=0.1cm,
%     ylabel near ticks, yticklabel pos=right
% ]
%     \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1d-normal.csv};
%     \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1d-normal.csv};
%     \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1d-normal.csv};
%     \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1d-normal.csv};
%     \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1d-normal.csv};
% \end{axis}
% \node[anchor = base west, yshift = 0.3em] at (sim-1d.north west) {
%     d: tri-diagonal $\boldsymbol{\Omega}$'s
% };

% \begin{axis}[
%     name=sim-1e,
%     anchor=north west, at={(sim-1c.below south west)}, yshift=-.8em
% ]
%     \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1e-normal.csv};
%     \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1e-normal.csv};
%     \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1e-normal.csv};
%     \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1e-normal.csv};
%     \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1e-normal.csv};
% \end{axis}
% \node[anchor = base west, yshift = 0.3em] at (sim-1e.north west) {
%     e: misspecified
% };

\matrix[anchor = west] at (sim-2a.right of east) {
    \draw[color=gmlm, legend entry, line width=1pt] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {GMLM}; \\
    \draw[color=tsir, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {TSIR}; \\
    \draw[color=mgcca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {MGCCA}; \\
    \draw[color=hopca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {HOPCA}; \\
    \draw[color=pca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {PCA}; \\
    \draw[color=lpca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {LPCA}; \\
    \draw[color=clpca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {CLPCA}; \\
};

\node[anchor = north] at (current bounding box.south) {Sample Size $n$};

\node[anchor = south, rotate = 90] at (current bounding box.west) {Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$};
\node[anchor = south, rotate = 270] at (current bounding box.east) {\phantom{Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$}};

\node[anchor = south, font=\large] at (current bounding box.north) {Ising Model Simulation};

\end{tikzpicture}
\end{document}
@@ -0,0 +1,148 @@ (new file: tensor normal simulation plot)
%%% R code to generate the input data files from corresponding simulation logs
% R> setwd("~/Work/tensorPredictors")
% R>
% R> for (sim.name in c("1a", "1b", "1c", "1d", "1e")) {
% R>     pattern <- paste0("sim\\_", sim.name, "\\_normal\\-[0-9T]*\\.csv")
% R>     log.file <- sort(
% R>         list.files(path = "sim/", pattern = pattern, full.names = TRUE),
% R>         decreasing = TRUE
% R>     )[[1]]
% R>
% R>     sim <- read.csv(log.file)
% R>
% R>     aggr <- aggregate(sim[, names(sim) != "sample.size"], list(sample.size = sim$sample.size), mean)
% R>
% R>     write.table(aggr, file = paste0("LaTeX/plots/aggr-", sim.name, "-normal.csv"), row.names = FALSE, quote = FALSE)
% R> }
\documentclass[border=0cm]{standalone}

\usepackage{tikz}
\usepackage{pgfplots}
\usepackage{bm}

\definecolor{gmlm}{RGB}{0,0,0}
\definecolor{mgcca}{RGB}{86,180,233}
\definecolor{tsir}{RGB}{0,158,115}
\definecolor{hopca}{RGB}{230,159,0}
\definecolor{pca}{RGB}{240,228,66}
\definecolor{lpca}{RGB}{0,114,178}
\definecolor{clpca}{RGB}{213,94,0}

\pgfplotsset{
    every axis/.style={
        xtick={100,200,300,500,750},
        ymin=-0.05, ymax=1.05,
        grid=both,
        grid style={gray, dotted}
    },
    every axis plot/.append style={
        mark = *,
        mark size = 1pt,
        line width=0.8pt
    }
}
\tikzset{
    legend entry/.style={
        mark = *,
        mark size = 1pt,
        mark indices = {2},
        line width=0.8pt
    }
}

\begin{document}
\begin{tikzpicture}[>=latex]

\begin{axis}[
    name=sim-1a,
    xticklabel=\empty
]
    \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1a-normal.csv};
    \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1a-normal.csv};
    \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1a-normal.csv};
    \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1a-normal.csv};
    \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1a-normal.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-1a.north west) {
    a: linear dependence on $\mathcal{F}_y \equiv y$
};

\begin{axis}[
    name=sim-1b,
    anchor=north west, at={(sim-1a.right of north east)}, xshift=0.1cm,
    xticklabel=\empty,
    ylabel near ticks, yticklabel pos=right
]
    \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1b-normal.csv};
    \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1b-normal.csv};
    \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1b-normal.csv};
    \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1b-normal.csv};
    \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1b-normal.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-1b.north west) {
    b: cubic dependence on $y$
};

\begin{axis}[
    name=sim-1c,
    anchor=north west, at={(sim-1a.below south west)}, yshift=-.8em,
    xticklabel=\empty
]
    \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1c-normal.csv};
    \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1c-normal.csv};
    \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1c-normal.csv};
    \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1c-normal.csv};
    \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1c-normal.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-1c.north west) {
    c: rank $1$ $\boldsymbol{\beta}$'s
};

\begin{axis}[
    name=sim-1d,
    anchor=north west, at={(sim-1c.right of north east)}, xshift=0.1cm,
    ylabel near ticks, yticklabel pos=right
]
    \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1d-normal.csv};
    \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1d-normal.csv};
    \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1d-normal.csv};
    \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1d-normal.csv};
    \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1d-normal.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-1d.north west) {
    d: tri-diagonal $\boldsymbol{\Omega}$'s
};

\begin{axis}[
    name=sim-1e,
    anchor=north west, at={(sim-1c.below south west)}, yshift=-.8em
]
    \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1e-normal.csv};
    \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1e-normal.csv};
    \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1e-normal.csv};
    \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1e-normal.csv};
    \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1e-normal.csv};
\end{axis}
\node[anchor = base west, yshift = 0.3em] at (sim-1e.north west) {
    e: misspecified
};

\matrix[anchor = center] at (sim-1e.right of east -| sim-1d.south) {
    \draw[color=gmlm, legend entry, line width=1pt] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {GMLM}; \\
    \draw[color=tsir, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {TSIR}; \\
    \draw[color=mgcca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {MGCCA}; \\
    \draw[color=hopca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {HOPCA}; \\
    \draw[color=pca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {PCA}; \\
};

\node[anchor = north] at (current bounding box.south) {Sample Size $n$};

\node[anchor = south, rotate = 90] at (current bounding box.west) {Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$};
\node[anchor = south, rotate = 270] at (current bounding box.east) {\phantom{Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$}};

\node[anchor = south, font=\large] at (current bounding box.north) {Tensor Normal Simulation};

\end{tikzpicture}
\end{document}
@@ -11,8 +11,8 @@ export(.projRank)
export(.projSymBand)
export(.projSymRank)
export(CISE)
-export(D)
+export(Dup)
-export(D.pinv)
+export(Dup.pinv)
export(GMLM)
export(GMLM.default)
export(HOPCA)

@@ -32,6 +32,7 @@ export(RMReg)
export(RMap)
export(S)
export(TSIR)
+export(approx.kron)
export(approx.kronecker)
export(colKronecker)
export(decompose.kronecker)

@@ -41,6 +42,7 @@ export(dist.projection)
export(dist.subspace)
export(exprs.all.equal)
export(gmlm_ising)
+export(gmlm_mlm)
export(gmlm_tensor_normal)
export(icu_tensor_normal)
export(ising_m2)

@@ -72,6 +74,7 @@ export(reduce)
export(riccati)
export(rowKronecker)
export(rtensornorm)
+export(slice.assign.expr)
export(slice.expr)
export(slice.select)
export(sylvester)

@@ -1,3 +1,31 @@
+#' @examples
+#' nrows <- c(3, 2, 5)
+#' ncols <- c(2, 4, 8)
+#'
+#' A <- Reduce(kronecker, Map(matrix, Map(rnorm, nrows * ncols), nrows))
+#'
+#' all.equal(A, Reduce(kronecker, approx.kron(A, nrows, ncols)))
+#'
+#' @export
+approx.kron <- function(A, nrows, ncols) {
+
+    # rearrange the Kronecker product `A` into the outer product `R`
+    dim(A) <- c(rev(nrows), rev(ncols))
+    axis.perm <- as.vector(t(matrix(seq_along(dim(A)), ncol = 2L))[, rev(seq_along(nrows))])
+    R <- aperm(A, axis.perm, resize = FALSE)
+    dim(R) <- nrows * ncols
+
+    # scaling factor for every product component
+    s <- sum(A^2)^(1 / length(dim(A)))
+
+    # higher-order SVD on `R`, keeping one singular vector per mode
+    Map(function(mode, nr, nc) {
+        s * `dim<-`(La.svd(mcrossprod(R, mode = mode), 1L, 0L)$u, c(nr, nc))
+    }, seq_along(nrows), nrows, ncols)
+}
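A two-factor usage sketch (dimensions illustrative), mirroring the roxygen example above; for an exact Kronecker product the rearranged `R` is rank one in every mode, so the reconstructed product should reproduce the input, while the individual factors are determined only up to scale and paired signs:

    A1 <- matrix(rnorm(3 * 2), 3, 2)
    A2 <- matrix(rnorm(4 * 5), 4, 5)
    fit <- approx.kron(kronecker(A1, A2), nrows = c(3, 4), ncols = c(2, 5))
    all.equal(kronecker(A1, A2), Reduce(kronecker, fit))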

#' Approximates Kronecker Product decomposition.
#'
#' Approximates the matrices `A` and `B` such that

@ -1,42 +1,110 @@
|
||||||
#' Specialized version of the GMLM for the Ising model (inverse Ising problem)
|
#' Specialized version of the GMLM for the Ising model (inverse Ising problem)
|
||||||
#'
|
#'
|
||||||
|
#' @todo TODO: Add beta and Omega projections
|
||||||
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
|
gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
|
||||||
|
# proj.betas = ..., proj.Omegas = ..., # TODO: this
|
||||||
max.iter = 1000L,
|
max.iter = 1000L,
|
||||||
eps = sqrt(.Machine$double.eps),
|
eps = sqrt(.Machine$double.eps),
|
||||||
step.size = function(iter) 1e-2 * (1000 / (iter + 1000)),
|
step.size = 1e-3,
|
||||||
zig.zag.threashold = 5L,
|
zig.zag.threashold = 20L,
|
||||||
patience = 5L,
|
patience = 3L,
|
||||||
nr.slices = 10L, # only for univariate `F(y) = y`
|
nr.slices = 20L, # only for univariate `F(y) = y`
|
||||||
slice.method = c("cut", "ecdf"), # only for univariate `F(y) = y` and `y` is a factor or integer
|
slice.method = c("cut", "ecdf", "none"), # only for univariate `F(y) = y` and `y` is a factor or integer
|
||||||
logger = function(...) { }
|
logger = function(...) { }
|
||||||
) {
|
) {
|
    # Get problem dimensions
    dimX <- dim(X)[-sample.axis]
    sample.size <- dim(X)[sample.axis]
    # treat scalar `F` as a tensor
    if (is.null(dim(F))) {
        dimF <- rep(1L, length(dimX))
        dim(F) <- ifelse(seq_along(dim(X)) == sample.axis, sample.size, 1L)
    } else {
        dimF <- dim(F)[-sample.axis]
    }

    # rearrange `X`, `F` such that the last axis enumerates observations
    if (sample.axis != length(dim(X))) {
        axis.perm <- c(seq_along(dim(X))[-sample.axis], sample.axis)
        X <- aperm(X, axis.perm)
        F <- aperm(F, axis.perm)
        sample.axis <- length(dim(X))
    }
    modes <- seq_along(dimX)

    # Special case for univariate response `vec F(y) = y`.
    # Due to high computational costs we use slicing.
    slice.method <- match.arg(slice.method)
    slices.ind <- if ((slice.method != "none") && (length(F) == sample.size)) {
        # univariate response: one scalar `F` entry per observation
        y <- as.vector(F)
        if (!(is.factor(y) || is.integer(y))) {
            if (slice.method == "ecdf") {
                y <- cut(ecdf(y)(y), nr.slices)
            } else {
                y <- cut(y, nr.slices)
            }
        }
        split(seq_len(sample.size), y, drop = TRUE)
    } else {
        seq_len(sample.size)
    }
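
    ## Minimal, self-contained illustration of the slicing above (hypothetical
    ## data): `cut(ecdf(y)(y), k)` yields `k` roughly equally sized slices of
    ## observation indices.
    # y <- rnorm(10)
    # str(split(seq_along(y), cut(ecdf(y)(y), 3L)))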

    # initialize betas with tensor normal estimate (ignoring data being binary)
    fit_normal <- gmlm_tensor_normal(X, F, sample.axis = length(dim(X)))
    betas <- fit_normal$betas

    # initialize Omegas from empirical log odds ratios
    Omegas <- Omegas.init <- Map(function(mode) {
        n <- prod(dim(X)[-mode])
        prob2 <- mcrossprod(X, mode = mode) / n    # pairwise "one-one" frequencies
        prob2[prob2 == 0] <- 1 / n                 # avoid log(0) for unobserved pairs
        prob1 <- diag(prob2)                       # marginal "one" frequencies
        `prob1^2` <- outer(prob1, prob1)
        # log odds ratios with a zeroed diagonal
        `diag<-`(log(((1 - `prob1^2`) / `prob1^2`) * prob2 / (1 - prob2)), 0)
    }, modes)
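
    ## Sanity check of the log odds ratio above for one pair of binary
    ## variables (hypothetical probabilities): with marginals `p1`, `p1.` and
    ## joint `p2`, independence (`p2 == p1 * p1.`) gives exactly 0.
    # p1 <- 0.3; p1. <- 0.4; p2 <- 0.2
    # log(((1 - p1 * p1.) / (p1 * p1.)) * p2 / (1 - p2))    # > 0, co-occurrence
    # p2 <- p1 * p1.
    # log(((1 - p1 * p1.) / (p1 * p1.)) * p2 / (1 - p2))    # == 0, independence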

    # Determine degenerate combinations, that is, variables which are mutually
    # exclusive in the data set
    matX <- mat(X, sample.axis)
    degen <- crossprod(matX) == 0
    degen.mask <- which(degen)
    # If there are degenerate combinations, compute an (arbitrary) bound for
    # the log odds parameters of those combinations
    if (length(degen.mask)) {
        degen.ind <- arrayInd(degen.mask, dim(degen))
        meanX <- colMeans(matX)
        prodX <- meanX[degen.ind[, 1]] * meanX[degen.ind[, 2]]
        degen.bounds <- log((1 - prodX) / (prodX * sample.size))
        # Component indices in Omegas of degenerate two-way interactions
        degen.ind <- arrayInd(degen.mask, rep(dimX, 2))
        degen.ind <- Map(function(d, m) {
            degen.ind[, m] + dimX[m] * (degen.ind[, m + length(dimX)] - 1L)
        }, dimX, seq_along(dimX))

        ## Enforce initial value degeneracy interaction param. constraints
        # Extract parameters corresponding to degenerate interactions
        degen.params <- do.call(rbind, Map(`[`, Omegas, degen.ind))
        # Degeneracy Constrained Parameters (sign is dropped)
        DCP <- mapply(function(vals, bound) {
            logVals <- log(abs(vals))
            err <- max(0, sum(logVals) - log(abs(bound)))
            exp(logVals - (err / length(vals)))
        }, split(degen.params, col(degen.params)), degen.bounds)
        # Update values in Omegas such that all degeneracy constraints hold
        Omegas <- Map(function(Omega, cp, ind) {
            # Combine multiple constraints for every element into a single
            # constraint value per element
            cp <- mapply(min, split(abs(cp), ind))
            ind <- as.integer(names(cp))
            `[<-`(Omega, ind, sign(Omega[ind]) * cp)
        }, Omegas, split(DCP, row(DCP)), degen.ind)
    }
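
    ## Tiny illustration of a "degenerate" pair (hypothetical data): variables
    ## 1 and 2 are never jointly active, so their interaction parameter is not
    ## identifiable from the likelihood and is only kept bounded.
    # matX <- matrix(c(1, 0,  0, 1,  1, 0,  0, 0), ncol = 2, byrow = TRUE)
    # crossprod(matX)    # zero off-diagonal entries mark degenerate pairs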

    # Initialize mean squared gradients
    grad2_betas <- Map(array, 0, Map(dim, betas))
    grad2_Omegas <- Map(array, 0, Map(dim, Omegas))

@@ -49,12 +117,8 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
    non_improving <- 0L

    # technical access points to dynamically access a multi-dimensional array
    `X[..., i]` <- slice.expr(X, sample.axis, index = i)
    `F[..., i]` <- slice.expr(F, sample.axis, index = i, drop = FALSE)

    # Iterate till a break condition triggers or till max. nr. of iterations
    for (iter in seq_len(max.iter)) {
@@ -62,31 +126,36 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
        grad_betas <- Map(matrix, 0, dimX, dimF)
        Omega <- Reduce(kronecker, rev(Omegas))

        # second order residuals accumulator
        # `sum_i (X_i o X_i - E[X o X | Y = y_i])`
        R2 <- array(0, dim = c(dimX, dimX))

        # negative log-likelihood
        loss <- 0

        for (i in slices.ind) {
            # slice size (nr. of objects in the slice)
            n_i <- length(i)

            sumF_i <- rowSums(eval(`F[..., i]`), dims = length(dimF))

            # within a slice, the natural parameters are approximated through
            # the slice mean of `F`
            diag_params_i <- mlm(sumF_i / n_i, betas)
            params_i <- Omega + diag(as.vector(diag_params_i))
            m2_i <- ising_m2(params_i)

            # accumulate loss
            matX_i <- mat(eval(`X[..., i]`), modes)
            loss <- loss - (
                sum(matX_i * (params_i %*% matX_i)) + n_i * log(attr(m2_i, "prob_0"))
            )

            R2_i <- tcrossprod(matX_i) - n_i * m2_i
            R1_i <- diag(R2_i)
            dim(R1_i) <- dimX

            for (j in modes) {
                grad_betas[[j]] <- grad_betas[[j]] +
                    mcrossprod(R1_i, mlm(sumF_i, betas[-j], modes[-j]), j)
            }
            R2 <- R2 + as.vector(R2_i)
        }

@@ -98,17 +167,14 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
        }, modes)

        # update optimization behavioral trackers
        accum_sign <- sign(last_loss - loss) - accum_sign
        non_improving <- max(0L, non_improving - 1L + 2L * (last_loss < loss))

        # check break conditions
        if (abs(accum_sign) > zig.zag.threashold) { break }
        if (non_improving > patience) { break }
        if (abs(last_loss - loss) < eps * last_loss) { break }

        # store current loss for the next iteration
        last_loss <- loss

@@ -122,17 +188,219 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
        # logging (before parameter update)
        logger(iter, loss, betas, Omegas, grad_betas, grad_Omegas)

        # Update Parameters (gradients scaled by the root mean squared
        # gradients, RMSprop style)
        betas <- Map(function(beta, grad, m2) {
            beta + (step.size / (sqrt(m2) + eps)) * grad
        }, betas, grad_betas, grad2_betas)
        Omegas <- Map(function(Omega, grad, m2) {
            Omega + (step.size / (sqrt(m2) + eps)) * grad
        }, Omegas, grad_Omegas, grad2_Omegas)
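
        ## Scalar sketch of the scaling above: parameters with large recent
        ## (mean squared) gradients `m2` take proportionally smaller steps.
        # m2 <- c(1e-6, 1e-2, 1)
        # 1e-3 / (sqrt(m2) + sqrt(.Machine$double.eps))    # effective steps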

        # Enforce degeneracy parameter constraints
        if (length(degen.mask)) {
            # Extract parameters corresponding to degenerate interactions
            degen.params <- do.call(rbind, Map(`[`, Omegas, degen.ind))
            # Degeneracy Constrained Parameters (sign is dropped)
            DCP <- mapply(function(vals, bound) {
                logVals <- log(abs(vals))
                err <- max(0, sum(logVals) - log(abs(bound)))
                exp(logVals - (err / length(vals)))
            }, split(degen.params, col(degen.params)), degen.bounds)
            # Update values in Omegas such that all degeneracy constraints hold
            Omegas <- Map(function(Omega, cp, ind) {
                # Combine multiple constraints for every element into a single
                # constraint value per element
                cp <- mapply(min, split(abs(cp), ind))
                ind <- as.integer(names(cp))
                `[<-`(Omega, ind, sign(Omega[ind]) * cp)
            }, Omegas, split(DCP, row(DCP)), degen.ind)
        }
    }

    structure(
        list(eta1 = array(0, dimX), betas = betas, Omegas = Omegas),
        tensor_normal = fit_normal,
        Omegas.init = Omegas.init,
        degen.mask = degen.mask
    )
}

################################################################################
### Development Interactive Block (Delete / Make sim / TODO: ...) ###
################################################################################
if (FALSE) { # interactive()

    par(bg = "#1d1d1d",
        fg = "lightgray",
        col = "#d5d5d5",
        col.axis = "#d5d5d5",
        col.lab = "#d5d5d5",
        col.main = "#d5d5d5",
        col.sub = "#d5d5d5", # col.sub = "#2467d0"
        pch = 16
    )
    cex <- 1.25
    col <- colorRampPalette(c("#f15050", "#1d1d1d", "#567DCA"))(256)

    .logger <- function() {
        iter <- 0L
        assign("log", data.frame(
            iter = rep(NA_integer_, 100000),
            loss = rep(NA_real_, 100000),
            dist.B = rep(NA_real_, 100000),
            dist.Omega = rep(NA_real_, 100000),
            norm.grad.B = rep(NA_real_, 100000),
            norm.grad.Omega = rep(NA_real_, 100000)
        ), envir = .GlobalEnv)
        assign("B.gmlm", NULL, .GlobalEnv)
        assign("Omega.gmlm", NULL, .GlobalEnv)

        function(it, loss, betas, Omegas, grad_betas, grad_Omegas) {
            # Store in global namespace (allows to stop and get the results)
            B.gmlm <- Reduce(kronecker, rev(betas))
            assign("B.gmlm", B.gmlm, .GlobalEnv)
            Omega.gmlm <- Reduce(kronecker, rev(Omegas))
            assign("Omega.gmlm", Omega.gmlm, .GlobalEnv)

            dist.B <- dist.subspace(B.true, B.gmlm, normalize = TRUE)
            dist.Omega <- norm(Omega.true - Omega.gmlm, "F")
            norm.grad.B <- sqrt(sum(mapply(norm, grad_betas, "F")^2))
            norm.grad.Omega <- sqrt(sum(mapply(norm, grad_Omegas, "F")^2))

            log[iter <<- iter + 1L, ] <<- list(
                it, loss, dist.B, dist.Omega, norm.grad.B, norm.grad.Omega
            )
            cat(sprintf("\r%3d - d(B): %.3f, d(O): %.3f, |g(B)|: %.3f, |g(O)|: %.3f, loss: %.3f\033[K",
                it, dist.B, dist.Omega, norm.grad.B, norm.grad.Omega, loss))
        }
    }

    sample.size <- 1000
    dimX <- c(2, 3)              # predictor `X` dimension
    dimF <- rep(1, length(dimX)) # dimension of the "function" `F(y)` of the response `y`

    betas <- Map(diag, 1, dimX, dimF)
    Omegas <- list(toeplitz(c(0, -2)), toeplitz(seq(1, 0, by = -0.5)))

    B.true <- Reduce(kronecker, rev(betas))
    Omega.true <- Reduce(kronecker, rev(Omegas))

    # data sampling routine
    c(X, F, y, sample.axis) %<-% (sample.data <- function(sample.size, betas, Omegas) {
        dimX <- mapply(nrow, betas)
        dimF <- mapply(ncol, betas)

        # generate response (sample axis is last axis)
        y <- runif(prod(sample.size, dimF), -2, 2) # ~ U[-2, 2]
        F <- array(y, dim = c(dimF, sample.size))

        Omega <- Reduce(kronecker, rev(Omegas))

        X <- apply(F, length(dim(F)), function(Fi) {
            dim(Fi) <- dimF
            params <- diag(as.vector(mlm(Fi, betas))) + Omega
            tensorPredictors::ising_sample(1, params)
        })
        dim(X) <- c(dimX, sample.size)

        list(X = X, F = F, y = y, sample.axis = length(dim(X)))
    })(sample.size, betas, Omegas)

    local({
        X.proto <- array(seq_len(prod(dimX)), dimX)
        interactions <- crossprod(mat(X, sample.axis))
        dimnames(interactions) <- rep(list(
            do.call(paste0, c("X", Map(slice.index, list(X.proto), seq_along(dimX))))
        ), 2)
        cat("Sample Size: ", sample.size, "\n")
        print.table(interactions, zero.print = ".")
    })

    # system.time({
    #     fit.gmlm <- gmlm_ising(X, y, logger = .logger())
    # })
    Rprof()
    fit.gmlm <- gmlm_ising(X, y)
    Rprof(NULL)
    summaryRprof()

    B.gmlm <- Reduce(kronecker, rev(fit.gmlm$betas))
    Omega.gmlm <- Reduce(kronecker, rev(fit.gmlm$Omegas))

    B.normal <- Reduce(kronecker, rev(attr(fit.gmlm, "tensor_normal")$betas))
    Omega.init <- Reduce(kronecker, rev(attr(fit.gmlm, "Omegas.init")))
    degen.mask <- attr(fit.gmlm, "degen.mask")

    local({
        layout(matrix(c(
            1, 2, 3, 3, 3,
            1, 4, 5, 6, 7
        ), nrow = 2, byrow = TRUE), widths = c(6, 3, 1, 1, 1))

        with(na.omit(log), {
            plot(range(iter), c(0, 1), type = "n", bty = "n",
                xlab = "Iterations", ylab = "Distance")

            lines(iter, dist.B, col = "red", lwd = 2)
            lines(iter, dist.Omega / max(dist.Omega), col = "blue", lwd = 2)
            lines(iter, (loss - min(loss)) / diff(range(loss)), col = "darkgreen", lwd = 2)

            norm.grad <- sqrt(norm.grad.B^2 + norm.grad.Omega^2)
            # Scale all gradient norms
            norm.grad.B <- norm.grad.B / max(norm.grad)
            norm.grad.Omega <- norm.grad.Omega / max(norm.grad)
            norm.grad <- norm.grad / max(norm.grad)
            lines(iter, norm.grad.B, lty = 2, col = "red")
            lines(iter, norm.grad.Omega, lty = 2, col = "blue")
            lines(iter, norm.grad, lty = 2, col = "darkgreen")

            axis(4, at = c(
                tail(dist.B, 1),
                min(dist.B)
            ), labels = round(c(
                tail(dist.B, 1),
                min(dist.B)
            ), 2), col = NA, col.ticks = "red", las = 1)
            axis(4, at = c(
                1,
                tail(dist.Omega, 1) / max(dist.Omega),
                min(dist.Omega) / max(dist.Omega)
            ), labels = round(c(
                max(dist.Omega),
                tail(dist.Omega, 1),
                min(dist.Omega)
            ), 2), col = NA, col.ticks = "blue", las = 1)

            abline(h = c(tail(dist.B, 1), min(dist.B)),
                lty = "dotted", col = "red")
            abline(h = c(max(dist.Omega), tail(dist.Omega, 1), min(dist.Omega)) / max(dist.Omega),
                lty = "dotted", col = "blue")
        })
        legend("topright", col = c("red", "blue", "darkgreen"), lty = 1, lwd = 2,
            legend = c("dist.B", "dist.Omega", "loss"), bty = "n")

        zlim <- max(abs(range(Omega.true, Omega.init, Omega.gmlm))) * c(-1, 1)
        matrixImage(Omega.true, main = "true", zlim = zlim, add.values = TRUE, col = col, cex = cex)
        matrixImage(round(Omega.init, 2), main = "init (cond. prob.)", zlim = zlim, add.values = TRUE, col = col, cex = cex)
        mtext(round(norm(Omega.true - Omega.init, "F"), 3), 3)
        matrixImage(round(Omega.gmlm, 2), main = "gmlm (ising)", zlim = zlim, add.values = TRUE, col = col, cex = cex,
            col.values = c(par("col"), "red")[`[<-`(array(1, rep(prod(dim(X)[-sample.axis]), 2)), degen.mask, 2)])
        mtext(round(norm(Omega.true - Omega.gmlm, "F"), 3), 3)

        zlim <- max(abs(range(B.true, B.normal, B.gmlm))) * c(-1, 1)
        matrixImage(B.true, main = "true",
            zlim = zlim, add.values = TRUE, col = col, cex = cex)
        matrixImage(round(B.normal, 2), main = "init (normal)",
            zlim = zlim, add.values = TRUE, axes = FALSE, col = col, cex = cex)
        mtext(round(dist.subspace(B.true, B.normal, normalize = TRUE), 3), 3)
        matrixImage(round(B.gmlm, 2), main = "gmlm (ising)",
            zlim = zlim, add.values = TRUE, axes = FALSE, col = col, cex = cex)
        mtext(round(dist.subspace(B.true, B.gmlm, normalize = TRUE), 3), 3)
    })
}

@@ -1,109 +1,10 @@
#' Specialized version of GMLM for the tensor normal model
#'
#' The underlying algorithm is an ``iterative (block) coordinate descent'' method
#'
#' @export
gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)),
    max.iter = 100L, proj.betas = NULL, proj.Omegas = NULL, logger = NULL,
    cond.threshold = 25, eps = 1e-6
) {
    # rearrange `X`, `F` such that the last axis enumerates observations

@@ -152,7 +53,7 @@ gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)),
    }, mcov(F, sample.axis, center = FALSE))

    # initialization of betas ``covariance direction mappings``
    betas <- betas.init <- Map(function(dX, dF) {
        s <- min(ncol(dX), nrow(dF))
        crossprod(dX[1:s, , drop = FALSE], dF[1:s, , drop = FALSE])
    }, dirsX, dirsF)

@@ -226,132 +127,8 @@ gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)),
    }

    structure(
        list(eta1 = mlm(meanX, Sigmas), betas = betas, Omegas = Omegas),
        betas.init = betas.init
    )
}
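
## Sketch: the initial `betas` can be compared against the final estimate via
## the `betas.init` attribute returned above (hypothetical fit objects):
# fit <- gmlm_tensor_normal(X, F)
# mapply(function(b0, b1) norm(b0 - b1, "F"), attr(fit, "betas.init"), fit$betas)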

@@ -19,6 +19,7 @@
matrixImage <- function(A, add.values = FALSE,
    main = NULL, sub = NULL, interpolate = FALSE, ..., zlim = NA,
    axes = TRUE, asp = 1, col = hcl.colors(24, "Blue-Red 3", rev = FALSE),
    col.values = par("col"), cex = 1,
    digits = getOption("digits"), new.plot = TRUE
) {
    # plot raster image

@@ -56,6 +57,7 @@ matrixImage <- function(A, add.values = FALSE,
            A[!add.values] <- NA
            A[add.values] <- format(A[add.values], digits = digits)
        }
        text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A,
            adj = 0.5, cex = cex, col = col.values)
    }
}
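
## Hypothetical usage sketch of the new `cex` / `col.values` arguments added
## above (any small matrix works):
# matrixImage(matrix(rnorm(12), 3, 4), add.values = TRUE,
#     cex = 1.5, col.values = "red")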