diff --git a/LaTeX/paper.tex b/LaTeX/paper.tex
index 110e8a2..935725a 100644
--- a/LaTeX/paper.tex
+++ b/LaTeX/paper.tex
@@ -487,15 +487,15 @@ Equation \eqref{eq:eta2} is underdetermined since $\t{(\mat{T}_2\pinv{\mat{D}_p}
 \todo{Maybe already here introduce the ``constraint'' set of $\Omega$'s allowed as $\{ \Omega\in\mathbb{R}_{++}^{p\times p} : \vec{\Omega} = \t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p})}\vec{\Omega} \}$}
 In a classical \emph{generalized linear model} (GLM), the link function connecting the natural parameters to the expectation of the sufficient statistic $\mat{\eta}_y = \mat{g}(\E[\mat{t}(\ten{X}) \mid Y = y])$ is invertible. Such a link may not exist in our setting, but for our purpose what we call the ``inverse'' link suffices. The ``inverse'' link $\widetilde{\mat{g}}$ exists as the natural parameters fully describe the distribution. As in the non-minimal formulation \eqref{eq:quadratic-exp-fam}, we define the ``inverse'' link through its tensor valued components
-\begin{align*}
-    \ten{g}_1(\mat{\eta}_y) &= \E[\ten{X} \mid Y = y], \\
-    \ten{g}_2(\mat{\eta}_y) &= \E[\ten{X}\circ\ten{X} \mid Y = y]
-\end{align*}
+\begin{align}
+    \ten{g}_1(\mat{\eta}_y) &= \E[\ten{X} \mid Y = y], \label{eq:inv-link1}\\
+    \ten{g}_2(\mat{\eta}_y) &= \E[\ten{X}\circ\ten{X} \mid Y = y] \label{eq:inv-link2}
+\end{align}
 as $\widetilde{\mat{g}}(\mat{\eta}_y) = (\vec\ten{g}_1(\mat{\eta}_y), \vec\ten{g}_2(\mat{\eta}_y))$.

 Under the quadratic exponential family model \eqref{eq:quadratic-exp-fam}, a sufficient reduction for the regression of $Y$ on $\ten{X}$ is given in \cref{thm:sdr}.
 \begin{theorem}[SDR]\label{thm:sdr}
-    A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:quadratic-exp-fam} with natural parameters \eqref{eq:eta1} and \eqref{eq:eta2} is given by
+    A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:quadratic-exp-fam} with natural parameters modeled as in \eqref{eq:eta1} and \eqref{eq:eta2} is given by
     \begin{align}\label{eq:sdr}
         \ten{R}(\ten{X})
         % &= (\ten{X} - \E\ten{X})\times\{ \t{\mat{\beta}_1}, \ldots, \t{\mat{\beta}_r} \}.
@@ -551,7 +551,7 @@ The maximum likelihood estimate of $\mat{\theta}_0$ is the solution to the optim
 \end{equation}
 with $\hat{\mat{\theta}}_n = (\vec\widehat{\overline{\ten{\eta}}}, \vec\widehat{\mat{B}}, \vech\widetilde{\mat{\Omega}})$ where $\widehat{\mat{B}} = \bigkron_{k = r}^{1}\widehat{\mat{\beta}}_k$ and $\widehat{\mat{\Omega}} = \bigkron_{k = r}^{1}\widehat{\mat{\Omega}}_k$.

-A straightforward and general method for parameter estimation is \emph{gradient descent}. For pure algorithmic speedup, by only changing the update rule but \emph{not} the gradient denoted $\nabla_{\mat{\theta}}l_n$ of the objective function, we use an adapted version of \emph{Nesterov Accelerated Gradient} descent (NAG) originaly described in \textcite{Nesterov1983} with an internal line search to determine the step size in each iteration, as described in \cref{alg:NAGD}. For applying gradient based optimization, we compute the gradients of $l_n$ in \cref{thm:grad}.
+A straightforward and general method for parameter estimation is \emph{gradient descent}. To apply gradient-based optimization, we compute the gradients of $l_n$ in \cref{thm:grad}.

 \begin{theorem}\label{thm:grad}
    For $n$ i.i.d.
observations $(\ten{X}_i, y_i), i = 1, ..., n$ the log-likelihood has the form \eqref{eq:log-likelihood} with $\mat{\theta}$ being the collection of all GMLM parameters $\overline{\ten{\eta}}$, ${\mat{B}} = \bigkron_{k = r}^{1}{\mat{\beta}}_k$ and ${\mat{\Omega}} = \bigkron_{k = r}^{1}{\mat{\Omega}}_k$ for $k = 1, ..., r$. Furthermore, let $\ten{G}_2(\mat{\eta}_y)$ be a tensor of dimensions $p_1, \ldots, p_r$ such that
@@ -569,43 +569,84 @@ A straightforward and general method for parameter estimation is \emph{gradient
     If $\mat{T}_2$ is the identity matrix $\mat{I}_{p(p + 1) / 2}$, then $\ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y)$.
 \end{theorem}

-\begin{algorithm}[hpt!]
-    \caption{\label{alg:NAGD}Nesterov Accelerated Gradient Descent}
-    \begin{algorithmic}[1]
-        \State Objective: $l_n(\mat{\theta})$
-        \State Arguments: Order $r + 1$ tensors $\ten{X}$, $\ten{F}$ \todo{ask Effie about input description}
-        \State Initialize: Parameters $\mat{\theta}^{(0)}$, $0 < \delta^{(1)}$ and $0 < \gamma < 1$
-        \\
-        \State $t \leftarrow 1$
-        \Comment{step counter}
-        \State $\mat{\theta}^{(1)} \leftarrow \mat{\theta}^{(0)}$
-        \Comment{artificial first step}
-        \State $(m^{(0)}, m^{(1)}) \leftarrow (0, 1)$
-        \Comment{momentum extrapolation weights}
-        \\
-        \Repeat \Comment{repeat untill convergence}
-            \State $\mat{M} \leftarrow \mat{\theta}^{(t)} + \frac{m^{(t - 1)} - 1}{m^{(t)}}(\mat{\theta}^{(t)} - \mat{\theta}^{(t - 1)})$ \Comment{momentum extrapolation}
-            \For{$\delta = \gamma^{-1}\delta^{(t)}, \delta^{(t)}, \gamma\delta^{(t)}, \gamma^2\delta^{(t)}, \ldots$} \Comment{line search}
-                \State $\mat{\theta} \leftarrow \mat{M} + \delta \nabla l_n(\mat{M})$
-                \If{$l_n(\mat{\theta}) \leq l_n(\mat{\theta}^{(t - 1)}) - \delta \|\nabla_{\mat{\theta}} l_n(\mat{M})\|_F^2$} \Comment{Armijo condition}
-                    \State $\mat{\theta}^{(t + 1)} \leftarrow \mat{\theta}$
-                    \State $\delta^{(t + 1)} \leftarrow \delta$
-                    \Break
-                \EndIf
-            \EndFor
-            \State $m^{(t + 1)} \leftarrow \frac{1 + \sqrt{1 + (2 m^{(t)})^2}}{2}$ \Comment{update extrapolation weights}
-            \State $t \leftarrow t + 1$
-        \Until converged
-    \end{algorithmic}
-\end{algorithm}
+Although any GMLM model can, in general, be fitted via gradient descent using \cref{thm:grad}, this may be very inefficient. In the case of the tensor normal distribution, an iterative cyclic updating scheme can be derived by setting the partial gradients of \cref{thm:grad} to zero. Described in more detail in \cref{sec:tensor_normal_estimation}, this method has much faster convergence, is stable and does not require any hyperparameters. The Ising model, on the other hand, does not admit such a scheme. There we need to use a gradient-based method, which is the subject of \cref{sec:ising_estimation}.

-\todo{Update the following, especially check that $\ten{X}$ needs to be center (as in the current version implicitly assumed)}
-\cref{alg:NAGD} requires initial parameter estimates $\mat{\theta}^{(0)}$. Therefore, we choose a simple heuristic approach by starting with $\mat{\Omega}_k^{(0)} = \mat{I}_{p_k}$ and $\overline{\ten{\eta}}_1^{(0)}$ dependent on the particular distribution. For example for a Tensor Normal distribution the sample mean is a good choice while for the Ising model zero initialization has nice properties. The interesting part are the multi-linear predictors $\mat{\beta}_k$.
-Therefore, let $\mat{\Sigma}_k = \frac{q_k}{n q}\sum_{i = 1}^n {\ten{F}_{y_i}}_{(k)}\t{{\ten{F}_{y_i}}_{(k)}}$ and $\mat{\Delta}_k = \frac{p_k}{n p}\sum_{i = 1}^n {\ten{X}_{i}}_{(k)}\t{{\ten{X}_{i}}_{(k)}}$ mode wise sample covariance estimates. For both we take the $s_k = \min(p_k, q_k)$ order SVD approximation as $\mat{\Delta}_k \approx \mat{U}_k\mat{D}_k\t{\mat{U}_k}$ and $\mat{\Sigma}_k \approx \mat{V}_k\mat{S}_k\t{\mat{V}_k}$. The approximate SVD gives us the $s_k$ first singular vectors with dimensions $\mat{U}_k(p_k\times s_k), \mat{D}_k(s_k\times s_k)$ and $\mat{V}_k(q_k\times s_k), \mat{S}_k(s_k\times s_k)$. Then we initialize
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Tensor Normal}\label{sec:tensor_normal_estimation}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+With $\ten{X}\mid Y = y$ following a tensor normal distribution, its density, with mean $\ten{\mu}_y$ and covariance $\mat{\Sigma} = \bigkron_{k = r}^{1}\mat{\Sigma}_k$, is given by
 \begin{displaymath}
-    \mat{\beta}_k^{(0)} = \mat{U}_k\sqrt{\mat{D}_k}\sqrt{\mat{S}_k}\t{\mat{V}_k}.
+    f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p / 2}\prod_{k = 1}^{r}\det(\mat{\Sigma}_k)^{-p / (2 p_k)}\exp\left( -\frac{1}{2}\left\langle\ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1} \right\rangle \right).
+\end{displaymath}
+We assume the distribution is non-degenerate, meaning that the covariances $\mat{\Sigma}_k$ are symmetric positive definite. For the sake of simplicity, we assume without loss of generality that $\ten{X}$ has marginal expectation zero, $\E\ten{X} = 0$. Rewriting this in the quadratic exponential family form \eqref{eq:quadratic-exp-fam} determines the scaling constant $c = -1/2$ and gives the relation to the GMLM parameters $\overline{\ten{\eta}}, \mat{\beta}_k$ and $\mat{\Omega}_k$, for $k = 1, \ldots, r$, as
+\begin{displaymath}
+    \ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k, \qquad
+    \mat{\Omega}_k = \mat{\Sigma}_k^{-1}
+\end{displaymath}
+where we used that $\overline{\ten{\eta}} = 0$ due to $0 = \E\ten{X} = \E\E[\ten{X}\mid Y] = \E\ten{\mu}_Y$ in combination with $\E\ten{F}_Y = 0$. Additionally, all the $\mat{\Omega}_k$'s are symmetric positive definite because the $\mat{\Sigma}_k$'s are. This leads to another simplification: the matrix $\mat{T}_2$ from \eqref{eq:t-stat} is equal to the identity. Consequently, the gradients of the log-likelihood $l_n$ in \cref{thm:grad} simplify as well. We get
+\begin{displaymath}
+    \ten{g}_1(\mat{\eta}_y) = \E[\ten{X}\mid Y = y] = \ten{\mu}_y, \qquad
+    \ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y) = \E[\ten{X}\circ\ten{X}\mid Y = y] \equiv \bigkron_{k = r}^1\mat{\Sigma}_k + (\vec{\ten{\mu}}_y)\t{(\vec{\ten{\mu}}_y)}.
 \end{displaymath}
-The default hyper-parameters in \cref{alg:NAGD} are set to $\delta^{(1)} = 1 / 100$ for the initial step size and the line search step parameter $\gamma = 2 / (1 + \sqrt{5})$ is the inverse golden ratio.
+In practice, we start the estimation process by demeaning a given data set $(\ten{X}_i, \ten{F}_{y_i})$, $i = 1, \ldots, n$. After demeaning, only the reduction matrices $\mat{\beta}_k$ and the scatter matrices $\mat{\Omega}_k$ need to be estimated, as $\overline{\ten{\eta}} = 0$. To solve the optimization problem \eqref{eq:mle} we then need initial values. For the initial estimates $\hat{\mat{\beta}}_k^{(0)}$ we use a simple heuristic approach.
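+% A minimal R sketch of the conditional mean relation above (assumptions: the
+% multi-linear multiplication `mlm` from the accompanying tensorPredictors
+% package; `Omegas` and `betas` are lists of the mode-wise GMLM parameters;
+% `F_y` holds the known response function evaluated at `y`):
+% R> library(tensorPredictors)
+% R> # mu_y = F_y x_{k = 1}^{r} Omega_k^{-1} beta_k, see the display above
+% R> mu_y <- mlm(F_y, Map(function(Omega, beta) solve(Omega, beta), Omegas, betas))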
+First, we compute moment-based mode-wise marginal covariance estimates $\widehat{\mat{\Sigma}}_k(\ten{X})$ and $\widehat{\mat{\Sigma}}_k(\ten{F}_Y)$ given by
+\begin{displaymath}
+    \widehat{\mat{\Sigma}}_k(\ten{X}) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{X}_i)_{(k)}\t{(\ten{X}_i)_{(k)}}, \qquad
+    \widehat{\mat{\Sigma}}_k(\ten{F}_Y) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{F}_{y_i})_{(k)}\t{(\ten{F}_{y_i})_{(k)}}.
+\end{displaymath}
+Then, for every mode $k = 1, \ldots, r$, we compute the first $j = 1, \ldots, q_k$ eigenvectors $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\mat{v}_j(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))$ and eigenvalues $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{X}))$, $\lambda_j(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))$ of the marginal covariance estimates. Let
+\begin{align*}
+    \mat{U}_k &= (\mat{v}_1(\widehat{\mat{\Sigma}}_k(\ten{X})), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{X}))), \\
+    \mat{D}_k &= \diag(\lambda_1(\widehat{\mat{\Sigma}}_k(\ten{X}))\lambda_1(\widehat{\mat{\Sigma}}_k(\ten{F}_{Y})), \ldots, \lambda_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{X}))\lambda_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{F}_{Y}))), \\
+    \mat{V}_k &= (\mat{v}_1(\widehat{\mat{\Sigma}}_k(\ten{F}_Y)), \ldots, \mat{v}_{q_k}(\widehat{\mat{\Sigma}}_k(\ten{F}_Y))).
+\end{align*}
+The initial estimates are then set to
+\begin{displaymath}
+    \hat{\mat{\beta}}_k^{(0)} = \mat{U}_k\sqrt{\mat{D}_k}\t{\mat{V}_k}.
+\end{displaymath}
+The initial values of $\mat{\Omega}_k$ are set to the identity, $\mat{\Omega}_k^{(0)} = \mat{I}_{p_k}$.
+
+Given any estimates $\hat{\mat{\beta}}_1, \ldots, \hat{\mat{\beta}}_r, \hat{\mat{\Omega}}_1, \ldots, \hat{\mat{\Omega}}_r$, we take the gradient $\nabla_{\mat{\beta}_j}$ of the log-likelihood $l_n$ in \eqref{eq:log-likelihood} for the tensor normal distribution from \cref{thm:grad} and keep all parameters except $\mat{\beta}_j$ fixed. Then, $\nabla_{\mat{\beta}_j}l_n = 0$ has the closed form solution
+\begin{equation}\label{eq:tensor_normal_beta_solution}
+    \t{\mat{\beta}_j} = \biggl(
+        \sum_{i = 1}^{n}
+        \Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\Omega}}_k^{-1}\hat{\mat{\beta}}_k \Bigr)_{(j)}
+        \t{\Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\beta}}_k \Bigr)_{(j)}}
+    \biggr)^{-1}
+    \biggl(
+        \sum_{i = 1}^{n}
+        \Bigl( \ten{F}_{y_i}\mlm_{k \neq j}\hat{\mat{\beta}}_k \Bigr)_{(j)}
+        \t{(\ten{X}_{i})_{(j)}}
+    \biggr)
+    \hat{\mat{\Omega}}_j.
+\end{equation}
+For the scatter matrices $\mat{\Omega}_j$ we need a small workaround. Equating the partial gradient of the $j$'th scatter matrix $\mat{\Omega}_j$ in \cref{thm:grad} to zero, $\nabla_{\mat{\Omega}_j}l_n = 0$, gives a quadratic matrix equation due to the dependence of $\ten{\mu}_y$ on $\mat{\Omega}_j$. In practice though, it is more stable, faster and equally accurate to use mode-wise covariance estimates via the residuals
+\begin{displaymath}
+    \hat{\ten{R}}_i = \ten{X}_i - \hat{\ten{\mu}}_{y_i} = \ten{X}_i - \ten{F}_{y_i}\mlm_{k = 1}^{r}\hat{\mat{\Omega}}_k^{-1}\hat{\mat{\beta}}_k.
+\end{displaymath}
+The estimates are computed via the intermediate values
+\begin{displaymath}
+    \tilde{\mat{\Sigma}}_j = \sum_{i = 1}^n (\hat{\ten{R}}_i)_{(j)} \t{(\hat{\ten{R}}_i)_{(j)}}
+\end{displaymath}
+which only lack the scaling $\tilde{s}$ such that $\tilde{s}\tilde{\mat{\Sigma}}_j = \hat{\mat{\Omega}}_j^{-1}$.
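+% A minimal R sketch of one update cycle as derived above (same assumptions as
+% before, plus the package functions `mat` (mode-wise unfolding) and
+% `mcrossprod` (mode-wise cross product); `X` and `F` collect all n
+% observations with the sample axis last, so the sums over i are absorbed
+% into the unfoldings):
+% R> modes <- seq_len(length(dim(X)) - 1L)
+% R> for (j in modes) { # closed form beta_j update, eq:tensor_normal_beta_solution
+% R>     FOB <- mlm(F, Map(solve, Omegas[-j], betas[-j]), modes[-j])
+% R>     FB <- mlm(F, betas[-j], modes[-j])
+% R>     betas[[j]] <- t(solve(tcrossprod(mat(FOB, j), mat(FB, j)),
+% R>         tcrossprod(mat(FB, j), mat(X, j)) %*% Omegas[[j]]))
+% R> }
+% R> # residuals and the intermediate values for the scatter matrices
+% R> R.hat <- X - mlm(F, Map(solve, Omegas, betas))
+% R> Sigma.tilde <- Map(function(j) mcrossprod(R.hat, mode = j), modes)
+% R> # the scaling follows from the trace condition derived next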
+The scaling is determined by the condition that the mean squared error has to equal the trace of the covariance estimate,
+\begin{displaymath}
+    \frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle = \tr\bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k^{-1} = \prod_{k = 1}^{r}\tr{\hat{\mat{\Omega}}_k^{-1}} = \tilde{s}^r\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k}.
+\end{displaymath}
+Solving for the scaling value gives
+\begin{displaymath}
+    \tilde{s} = \biggl(\Bigl(\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k}\Bigr)^{-1}\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle\biggr)^{1 / r},
+\end{displaymath}
+resulting in the estimates $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{-1}$.
+
+Estimation is then performed by cyclically updating the estimates $\hat{\mat{\beta}}_j$, $j = 1, \ldots, r$, via \eqref{eq:tensor_normal_beta_solution}; then we recompute the $\hat{\mat{\Omega}}_j$ estimates simultaneously, keeping the $\hat{\mat{\beta}}_j$'s fixed. This procedure is repeated until convergence, which is very fast; experiments showed that it usually occurs in fewer than $10$ iterations.
+
+A technical detail for numerical stability is to ensure that the scaled intermediate values $\tilde{s}\tilde{\mat{\Sigma}}_j$, assumed to be symmetric and positive definite, are well conditioned. Otherwise, the procedure can become numerically unstable when computing $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{-1}$. Therefore, we estimate the condition number of $\tilde{s}\tilde{\mat{\Sigma}}_j$ before computing the inverse. If it is ill conditioned, we regularize and use $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j + 0.2 \lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)\mat{I}_{p_j})^{-1}$ instead, where $\lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)$ is the first (maximum) eigenvalue. Experiments showed that this regularization is usually required in the first few iterations only.
+
+Furthermore, the parameter space may follow the more general setting of \cref{thm:param-manifold}. In this case, an update may produce estimates outside of the parameter space. A simple and efficient remedy is to project every updated estimate onto the corresponding manifold.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Ising Model}\label{sec:ising_estimation}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

 \newpage
@@ -674,6 +715,8 @@ As a basis to ensure that the constraint parameter space $\Theta$ is a manifold,
     then the constrained parameter space $\Theta = \mathbb{R}^p \times \manifold{K}_{\mat{B}}\times\manifold{CK}_{\mat{\Omega}}\subset\mathbb{R}^{p(p + 2 q + 3) / 2}$ is a smooth embedded manifold.
 \end{theorem}

+\subsection{Matrix Manifolds}\label{sec:matrix-manifolds}
+
 {\color{red}

 Flexibility of Manifold Structure
@@ -716,55 +759,6 @@ It is not necessary to have a perfect maximizer, as long as the objective has fi
     with asymptotic variance-covariance structure $\mat{\Sigma}_{\mat{\theta}_0}$ given in \eqref{eq:asymptotic-covariance-gmlm}.
 \end{theorem}

-% \begin{theorem}[{\textcite[Thm~5.35]{asymStats-van_der_Vaart1998}}]
-%     Let $\{ p_{\mat{\theta}} : \mat{\theta}\in\Theta \}$ be a collection of identifiable subprobability\footnote{A subprobability measure is a measure $\nu$ for the measure space $(\mathfrak{X}, \nu)$ such that $\nu(\mathfrak{X}) \leq 1$.}
Then $M(\mat{\theta}) = P_{\mat{\theta}_0}\log p_{\mat{\theta}} / p_{\mat{\theta}_0}$ attains its maximum uniquely at $\mat{\theta}_0$. -% \end{theorem} - -% \begin{proof}[Proof of \cref{thm:asymptotic-normality-gmlm}] -% First we prove the consistency for which we apply \cref{thm:consistency-on-metric-spaces} and then prove that there - -% Most parts of the proof are identical to the proof \textcite[Thm~3.1]{asymptoticMLE-BuraEtAl2018} with addaptations in a few places. - -% {\color{green}Bla Bla Bla ...} - -% \todo{regularity conditions!} - -% By rewriting \eqref{eq:eta1} as -% \begin{displaymath} -% \mat{\eta}_{1y} = -% \vec{\overline{\ten{\eta}}} + \mat{B}\vec{\ten{F}_y} = (\vec{\overline{\ten{\eta}}}, \mat{B})\begin{pmatrix} -% 1 \\ \vec{\ten{F}_y} -% \end{pmatrix} = (\mat{I}_p\otimes \t{(1, \vec{\ten{F}_y})})\vec((\vec{\overline{\ten{\eta}}}, \mat{B})) -% \end{displaymath} -% the natural parameter $\mat{\eta}_y$ with components \eqref{eq:eta1}, \eqref{eq:eta2} can be written with $\mat{\theta} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vec{\mat{\Omega}})$ as -% \begin{displaymath} -% \mat{\eta}_y = \begin{pmatrix} -% \mat{I}_p\otimes \t{(1, \vec{\ten{F}_y})} & 0 \\ -% 0 & c\t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})} -% \end{pmatrix}\mat{\theta} = \mat{F}(y)\mat{\theta} -% \end{displaymath} -% Using the new form, let with $z = (\ten{X}, y)$ -% \begin{displaymath} -% m_{\mat{\theta}}(z) = \langle \mat{F}(y)\mat{\theta}, \mat{t}(\ten{X}) \rangle - b(\mat{F}(y)\mat{\theta}) -% \end{displaymath} -% which allows to write the log-likelihood \eqref{eq:log-likelihood}, scaled by $n^{-1}$, with $Z_i = (\ten{X}_i, Y_i)$, as -% \begin{displaymath} -% M_n(\mat{\theta}) = \frac{1}{n}\sum_{i = 1}^{n} m_{\mat{\theta}}(Z_i) = \frac{1}{n} l_n(\mat{\theta}). -% \end{displaymath} - -% The strict convexity of $b$ ensures that $\mat{\theta}\mapsto m_{\mat{\theta}}(z)$ is strictly concave for every fixed $z$. Taking expectation preserves the concavity yielding $M(\mat{\theta}) = \E m_{\mat{\theta}}(Z)$ to be concave aswell. The identifiability \todo{???} of \todo{???} allows to apply \textcite[Thm~5.35]{asymStats-van_der_Vaart1998} which gives for the family of conditional densities $\{ f_{\mat{\theta}}(\,.\mid Y) : \mat{\theta}\in\Theta \}$ that -% \begin{displaymath} -% \E[m_{\mat{\theta}}(Z)\mid Y] \leq \E[m_{\mat{\theta}_0}(Z)\mid Y]. -% \end{displaymath} -% Taking the expectation with respect to $Y$ gives $M(\mat{\theta}) = \E[m_{\mat{\theta}}(Z)] \leq \E[m_{\mat{\theta}_0}(Z)] = M(\mat{\theta}_0)$ for every $\mat{\theta}$. 
Now, assume their exists a $\mat{\theta}_1$ such that $M(\mat{\theta}_1) = M(\mat{\theta}_0)$, then $P(\mat{F}(Y)\mat{\theta}_1 = \mat{F}(Y)\mat{\theta}_0) = 1$ which means \todo{figure that out why} which contradicts \todo{$P(\mat{F}(Y)\mat{\theta} = \overline{\ten{\eta}}) = 1$ which is a violaten of an assumption.} Finally, with the integrability assumtion we can applying \textcite[Thm~A.1]{asymptoticMLE-BuraEtAl2018} which gives that the MLE $\hat{\mat{\theta}}_n$ of \eqref{eq:mle} exists and is unique.\todo{Fix that!!!}
-
-
-% {\color{green}Foo Bar Baz ...}
-
-% \todo{finish me!}
-% \end{proof}
-
-
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

 \subsection{Asymptotic Normality}
@@ -814,38 +808,96 @@ The following is a reformulation of \textcite[Lemma~2.3]{asymptoticMLE-BuraEtAl2
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 We compare our method with a few other methods for the tensor normal and the Ising model (inverse Ising problem).

-\subsection{Tensor Normal Distribution}
-The \emph{tensor normal distribution} $\mathcal{TN}(\ten{\mu}, \mat{\Sigma}_1, ..., \mat{\Sigma}_r)$ is a generalization of the \emph{matrix normal distribution} $\mathcal{MN}(\mat{\mu}, \mat{\Sigma}_1, \mat{\Sigma}_2)$. The density of the conditional tensor normal distribution $\ten{X}\mid Y = y$ is given by
-\begin{displaymath}
-    f_{\mat{\theta}_y}(\ten{X}\mid Y = y) = (2\pi)^{-p/2}\prod_{k = 1}^r |\mat{\Sigma}_{k}|^{-p / 2 p_{k}}\exp\Big(
-        -\frac{1}{2}\Big\langle \ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k\in[r]}\mat{\Sigma}_{k} \Big\rangle
-    \Big)
-\end{displaymath}

+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Tensor Normal}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

-\subsection{Ising Model}
+
+\begin{itemize}
+    \item[a] A simple setup with linear dependence on $\ten{F}_y \equiv y$.
+    \item[b] Cubic dependence on $y$.
+    \item[c] Rank $1$ $\mat{\beta}_k$'s.
+    \item[d] Tri-diagonal $\mat{\Omega}_k$'s.
+    \item[e] Misspecified model.
+\end{itemize}

 \begin{figure}
     \centering
-    \begin{scaletikzpicturetowidth}{0.5\textwidth}
-        \input{plots/sim-normal.tex}
-    \end{scaletikzpicturetowidth}
-    \caption{\label{fig:sim-normal}As a test case we show the $3$-tensor normal (generalization of the matrix normal)}
+    \includegraphics[width = \textwidth]{plots/sim-normal.pdf}
+    \caption{\label{fig:sim-normal}Tensor normal simulation: mean subspace distance $d(\mat{B}, \hat{\mat{B}})$ between the true and estimated reduction over sample size $n$ for the settings a--e.}
 \end{figure}

-\subsection{Matrix Normal and Ising Model}
-If $\mat{X} \in \mathbb{R}^{p_1 \times p_2}$, then $\mat{\theta} = (\overline{\eta}, \mat{\beta}_1, \mat{\beta}_2, \mat{\Omega}_1, \mat{\Omega}_2)$ and $\mat{F}_y$ is also matrix valued. The conditional pdf of $\mat{X}\mid Y$ is
-\begin{align*}
-    f_{\mat{\theta}}(\mat{X}\mid Y = y)
-    &= h(\mat{X})\exp(\langle\mat{X}, \mat{\eta}_1(\mat{\theta})\rangle + \langle\mat{X}\circ\mat{X}, \mat{\eta}_2(\mat{\theta})\rangle - b(\mat{\eta}_y(\mat{\theta}))) \\
-    &= h(\mat{X})\exp(\tr((\overline{\mat{\eta}} + \mat{\beta}_1\mat{F}_y\t{\mat{\beta}_2})\t{\mat{X}}) + \tr(\mat{\Omega}_1\mat{X}\mat{\Omega}_2\t{\mat{X}}) - b(\mat{\eta}_y(\mat{\theta}))).
-\end{align*}
-The MLE estimate $\hat{\mat{\theta}}_n = (\widehat{\overline{\mat{\eta}}}, \widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1, \widehat{\mat{\Omega}}_2\otimes \widehat{\mat{\Omega}}_2)$ is asymptotically normal and
-\begin{displaymath}
-    \widehat{\ten{R}}(\mat{X}) = \t{(\widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1)}\vec(\mat{X} - \E\mat{X}) \equiv \t{\widehat{\mat{\beta}}_1}(\mat{X} - \E\mat{X})\widehat{\mat{\beta}}_2
-\end{displaymath}
-is the MLE of the sufficient reduction $\ten{R}(\mat{X})$ of dimension $q_1 \times q_2 \leq p_1 \times p_2 $.\\[1.6em]

+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Ising Model}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\begin{figure}
+    \centering
+    \includegraphics[]{plots/sim-ising.pdf}
+    \caption{\label{fig:sim-ising}Ising model simulation: mean subspace distance $d(\mat{B}, \hat{\mat{B}})$ between the true and estimated reduction over sample size $n$.}
+\end{figure}
+
+\begin{table}
+    \begin{tabular}{c | ccc ccc c}
+        $n$  & GMLM & PCA & HOPCA & LPCA & CLPCA & TSIR & MGCCA \\ \hline
+        100  & {\bf 0.34} (0.14) & 0.90 (0.04) & 0.90 (0.05) & 0.94 (0.09) & 0.91 (0.03) & 0.48 (0.19) & 0.55 (0.13) \\
+        200  & {\bf 0.25} (0.11) & 0.90 (0.03) & 0.90 (0.03) & 0.96 (0.07) & 0.91 (0.02) & 0.38 (0.16) & 0.53 (0.10) \\
+        300  & {\bf 0.20} (0.09) & 0.89 (0.02) & 0.89 (0.02) & 0.97 (0.06) & 0.91 (0.02) & 0.29 (0.13) & 0.51 (0.11) \\
+        500  & {\bf 0.16} (0.07) & 0.90 (0.02) & 0.90 (0.02) & 0.98 (0.01) & 0.91 (0.01) & 0.23 (0.10) & 0.50 (0.08) \\
+        750  & {\bf 0.13} (0.05) & 0.90 (0.01) & 0.90 (0.01) & 0.98 (0.02) & 0.91 (0.01) & 0.23 (0.08) & 0.53 (0.06)
+    \end{tabular}
+    \caption{\label{tab:sim-ising}Mean subspace distance $d(\mat{B}, \hat{\mat{B}})$, with standard deviations in parentheses, for the Ising model simulation over different sample sizes $n$; the best value per sample size is in bold.}
+\end{table}

% The \emph{tensor normal distribution} $\mathcal{TN}(\ten{\mu}, \mat{\Sigma}_1, ..., \mat{\Sigma}_r)$ is a generalization of the \emph{matrix normal distribution} $\mathcal{MN}(\mat{\mu}, \mat{\Sigma}_1, \mat{\Sigma}_2)$. \todo{ref} The density of the conditional tensor normal distribution $\ten{X}\mid Y = y$ according to the quadratic exponential family \eqref{eq:quadratic-exp-fam} where only the first moment depends on $y$ is given by
% \begin{displaymath}
%     f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p/2}\prod_{k = 1}^r |\mat{\Sigma}_{k}|^{-p / 2 p_{k}}\exp\Big(
%         -\frac{1}{2}\Big\langle \ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k\in[r]}\mat{\Sigma}_{k} \Big\rangle
%     \Big)
% \end{displaymath}
% Rewriting this in the form of an exponential family as in \eqref{eq:quadratic-exp-fam} allows to determin the natural parameter components $\mat{\eta}_{y1}$ and $\mat{\eta}_2$. Since a non-degenerate normal distribution requires the covariance matrices $\mat{\Sigma}_k$ to be symmetric positive definite the relation to the second moment natural parameter $\mat{\eta}_2$ simplifies as we can set $\mat{T}_2$ in \eqref{eq:eta2-manifold} to the identity. This then gives the relation to the natural parameters as in \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold} as
% \begin{displaymath}
%     \mat{\eta}_{1y} = \vec\Bigl(\ten{\mu}_y\mlm_{k = 1}^{r}\mat{\Sigma}_k\Bigr), \qquad
%     \mat{\eta}_2 = c\t{\mat{D}_p}\vec\bigkron_{k = r}^{1}\mat{\Sigma}_k^{-1}
% \end{displaymath}
% with scaling constant $c = -1 / 2$.
Modeling the natural parameters as in \eqref{eq:eta1} and \eqref{eq:eta2} relates the mean $\ten{\mu}_y$ and the covariance matrices $\mat{\Sigma}_k$ of the tensor normal to the generalized multi-linear model parameter $\overline{\ten{\eta}}$ and $\mat{\beta}_k$, $\mat{\Omega}_k$, for $k = 1, \ldots, r$ through +% \begin{displaymath} +% \ten{\mu}_y = \Bigl(\overline{\ten{\eta}} + \ten{F}_y\mlm_{j = 1}^{r}\mat{\beta}_j\Bigr)\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}, \qquad \mat{\Omega}_k = \mat{\Sigma}_k^{-1}. +% \end{displaymath} +% This completely determines the tensor normal distribution given the GMLM parameter. + + + + +% To estimate the GMLM parameters $\mat{\theta} = (\overline{\ten{\eta}}, \mat{\beta}_1, \ldots, \mat{\beta}_r, \mat{\Omega}_1, \ldots\mat{\Omega}_r)$ given a data set $(\ten{X}_i, y_i)$ of $i = 1, \ldots, n$ observation we use the gradients provided by \cref{thm:grad}. It turns out that the equations $\nabla_{\overline{\ten{\eta}}}l_n = 0, \nabla_{\mat{\beta}_j}l_n = 0$ and $\nabla_{\mat{\Omega}_j}l_n = 0$, for $j = 1, \ldots, r$, can be solved for the differentiation variable assuming all the other parameter blocks to be constant. Centering the observed $\ten{X}$ leads to a cleaner formulation given by \todo{fix the following!} +% \begin{align*} +% \hat{\overline{\ten{\eta}}} &= \frac{1}{n}\sum_{i = 1}^{n} \ten{X}_i, \\ +% \t{\hat{\mat{\beta}}_j} &= \mat{\Omega}_j \Bigl(\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\Omega}_k^{-1}\mat{\beta}_k\Bigr)_{(j)}\t{\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\beta}_k\Bigr)_{(j)}}\Bigr)^{-1}\Bigl(\ten{F}_{y_i}\mlm_{k \neq j}\mat{\beta}_k\Bigr)_{(j)}\ten{X}_{(j)}, \\ +% \t{\Omega}_j &= scaling \frac{1}{n} +% \end{align*} + +% This allows to use a \emph{block coordinate descent} method instead of gradient descent. This method keeps all but one parameter block fixed and optimized the objective for a single block. Given a closed form solution for the partial gradients, only a single update is required to solve the partial optimization problem. This means that the block coordinate descent method reduces to a cyclic updating. This not only converges very fast, it also does not require any hyper parameters. + +% For this iterative scheme we do need some initial estimates. For those we + + + +% \subsection{Matrix Normal and Ising Model} + +% If $\mat{X} \in \mathbb{R}^{p_1 \times p_2}$, then $\mat{\theta} = (\overline{\eta}, \mat{\beta}_1, \mat{\beta}_2, \mat{\Omega}_1, \mat{\Omega}_2)$ and $\mat{F}_y$ is also matrix valued. The conditional pdf of $\mat{X}\mid Y$ is +% \begin{align*} +% f_{\mat{\theta}}(\mat{X}\mid Y = y) +% &= h(\mat{X})\exp(\langle\mat{X}, \mat{\eta}_1(\mat{\theta})\rangle + \langle\mat{X}\circ\mat{X}, \mat{\eta}_2(\mat{\theta})\rangle - b(\mat{\eta}_y(\mat{\theta}))) \\ +% &= h(\mat{X})\exp(\tr((\overline{\mat{\eta}} + \mat{\beta}_1\mat{F}_y\t{\mat{\beta}_2})\t{\mat{X}}) + \tr(\mat{\Omega}_1\mat{X}\mat{\Omega}_2\t{\mat{X}}) - b(\mat{\eta}_y(\mat{\theta}))). 
% \end{align*}
% The MLE estimate $\hat{\mat{\theta}}_n = (\widehat{\overline{\mat{\eta}}}, \widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1, \widehat{\mat{\Omega}}_2\otimes \widehat{\mat{\Omega}}_2)$ is asymptotically normal and
% \begin{displaymath}
%     \widehat{\ten{R}}(\mat{X}) = \t{(\widehat{\mat{\beta}}_2\otimes \widehat{\mat{\beta}}_1)}\vec(\mat{X} - \E\mat{X}) \equiv \t{\widehat{\mat{\beta}}_1}(\mat{X} - \E\mat{X})\widehat{\mat{\beta}}_2
% \end{displaymath}
% is the MLE of the sufficient reduction $\ten{R}(\mat{X})$ of dimension $q_1 \times q_2 \leq p_1 \times p_2 $.\\[1.6em]

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \appendix
diff --git a/LaTeX/plots/sim-ising.tex b/LaTeX/plots/sim-ising.tex
new file mode 100644
index 0000000..7d56b21
--- /dev/null
+++ b/LaTeX/plots/sim-ising.tex
@@ -0,0 +1,151 @@
+%%% R code to generate the input data files from corresponding simulation logs
+% R> setwd("~/Work/tensorPredictors")
+% R>
+% R> for (sim.name in c("2a")) {
+% R>     pattern <- paste0("sim\\_", sim.name, "\\_ising\\-[0-9T]*\\.csv")
+% R>     log.file <- sort(
+% R>         list.files(path = "sim/", pattern = pattern, full.names = TRUE),
+% R>         decreasing = TRUE
+% R>     )[[1]]
+% R>
+% R>     sim <- read.csv(log.file)
+% R>
+% R>     aggr <- aggregate(sim[, names(sim) != "sample.size"], list(sample.size = sim$sample.size), mean)
+% R>
+% R>     write.table(aggr, file = paste0("LaTeX/plots/aggr-", sim.name, "-ising.csv"), row.names = FALSE, quote = FALSE)
+% R> }
+\documentclass[border=0cm]{standalone}
+
+\usepackage{tikz}
+\usepackage{pgfplots}
+\usepackage{bm}
+
+\definecolor{gmlm}{RGB}{0,0,0}
+\definecolor{mgcca}{RGB}{86,180,233}
+\definecolor{tsir}{RGB}{0,158,115}
+\definecolor{pca}{RGB}{240,228,66}
+\definecolor{hopca}{RGB}{230,159,0}
+\definecolor{lpca}{RGB}{127,127,127}
+\definecolor{clpca}{RGB}{191,191,191}
+
+\pgfplotsset{
+    every axis/.style={
+        xtick={100,200,300,500,750},
+        ymin=-0.05, ymax=1.05,
+        grid=both,
+        grid style={gray, dotted}
+    },
+    every axis plot/.append style={
+        mark = *,
+        mark size = 1pt,
+        line width=0.8pt
+    }
+}
+\tikzset{
+    legend entry/.style={
+        mark = *,
+        mark size = 1pt,
+        mark indices = {2},
+        line width=0.8pt
+    }
+}
+
+\begin{document}
+\begin{tikzpicture}[>=latex]
+
+    \begin{axis}[
+        name=sim-2a
+    ]
+        \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-2a-ising.csv};
+        \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-2a-ising.csv};
+        \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-2a-ising.csv};
+        \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-2a-ising.csv};
+        \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-2a-ising.csv};
+        \addplot[color = lpca] table[x = sample.size, y = dist.subspace.lpca] {aggr-2a-ising.csv};
+        \addplot[color = clpca] table[x = sample.size, y = dist.subspace.clpca] {aggr-2a-ising.csv};
+    \end{axis}
+    \node[anchor = base west, yshift = 0.3em] at (sim-2a.north west) {
+        a: small
+    };
+
+    % \begin{axis}[
+    %     name=sim-1b,
+    %     anchor=north west, at={(sim-2a.right of north east)}, xshift=0.1cm,
+    %     xticklabel=\empty,
+    %     ylabel near ticks, yticklabel pos=right
+    % ]
+    %     \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1b-normal.csv};
+    %     \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1b-normal.csv};
+    %     \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1b-normal.csv};
+    %
\addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1b-normal.csv}; + % \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1b-normal.csv}; + % \end{axis} + % \node[anchor = base west, yshift = 0.3em] at (sim-1b.north west) { + % b: cubic dependence on $y$ + % }; + + % \begin{axis}[ + % name=sim-1c, + % anchor=north west, at={(sim-2a.below south west)}, yshift=-.8em, + % xticklabel=\empty + % ] + % \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1c-normal.csv}; + % \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1c-normal.csv}; + % \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1c-normal.csv}; + % \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1c-normal.csv}; + % \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1c-normal.csv}; + % \end{axis} + % \node[anchor = base west, yshift = 0.3em] at (sim-1c.north west) { + % c: rank $1$ $\boldsymbol{\beta}$'s + % }; + + % \begin{axis}[ + % name=sim-1d, + % anchor=north west, at={(sim-1c.right of north east)}, xshift=0.1cm, + % ylabel near ticks, yticklabel pos=right + % ] + % \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1d-normal.csv}; + % \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1d-normal.csv}; + % \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1d-normal.csv}; + % \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1d-normal.csv}; + % \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1d-normal.csv}; + % \end{axis} + % \node[anchor = base west, yshift = 0.3em] at (sim-1d.north west) { + % d: tri-diagonal $\boldsymbol{\Omega}$'s + % }; + + % \begin{axis}[ + % name=sim-1e, + % anchor=north west, at={(sim-1c.below south west)}, yshift=-.8em + % ] + % \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1e-normal.csv}; + % \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1e-normal.csv}; + % \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1e-normal.csv}; + % \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1e-normal.csv}; + % \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1e-normal.csv}; + % \end{axis} + % \node[anchor = base west, yshift = 0.3em] at (sim-1e.north west) { + % e: missspecified + % }; + + + \matrix[anchor = west] at (sim-2a.right of east) { + \draw[color=gmlm, legend entry, line width=1pt] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {GMLM}; \\ + \draw[color=tsir, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {TSIR}; \\ + \draw[color=mgcca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {MGCCA}; \\ + \draw[color=hopca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {HOPCA}; \\ + \draw[color=pca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {PCA}; \\ + \draw[color=lpca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {LPCA}; \\ + \draw[color=clpca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {CLPCA}; \\ + }; + + + \node[anchor = north] at (current bounding box.south) {Sample Size $n$}; + + \node[anchor = 
south, rotate = 90] at (current bounding box.west) {Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$};
+    \node[anchor = south, rotate = 270] at (current bounding box.east) {\phantom{Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$}};
+
+    \node[anchor = south, font=\large] at (current bounding box.north) {Ising Model Simulation};
+
+\end{tikzpicture}
+\end{document}
diff --git a/LaTeX/plots/sim-normal.tex b/LaTeX/plots/sim-normal.tex
new file mode 100644
index 0000000..54be60f
--- /dev/null
+++ b/LaTeX/plots/sim-normal.tex
@@ -0,0 +1,148 @@
+%%% R code to generate the input data files from corresponding simulation logs
+% R> setwd("~/Work/tensorPredictors")
+% R>
+% R> for (sim.name in c("1a", "1b", "1c", "1d", "1e")) {
+% R>     pattern <- paste0("sim\\_", sim.name, "\\_normal\\-[0-9T]*\\.csv")
+% R>     log.file <- sort(
+% R>         list.files(path = "sim/", pattern = pattern, full.names = TRUE),
+% R>         decreasing = TRUE
+% R>     )[[1]]
+% R>
+% R>     sim <- read.csv(log.file)
+% R>
+% R>     aggr <- aggregate(sim[, names(sim) != "sample.size"], list(sample.size = sim$sample.size), mean)
+% R>
+% R>     write.table(aggr, file = paste0("LaTeX/plots/aggr-", sim.name, "-normal.csv"), row.names = FALSE, quote = FALSE)
+% R> }
+\documentclass[border=0cm]{standalone}
+
+\usepackage{tikz}
+\usepackage{pgfplots}
+\usepackage{bm}
+
+\definecolor{gmlm}{RGB}{0,0,0}
+\definecolor{mgcca}{RGB}{86,180,233}
+\definecolor{tsir}{RGB}{0,158,115}
+\definecolor{hopca}{RGB}{230,159,0}
+\definecolor{pca}{RGB}{240,228,66}
+\definecolor{lpca}{RGB}{0,114,178}
+\definecolor{clpca}{RGB}{213,94,0}
+
+\pgfplotsset{
+    every axis/.style={
+        xtick={100,200,300,500,750},
+        ymin=-0.05, ymax=1.05,
+        grid=both,
+        grid style={gray, dotted}
+    },
+    every axis plot/.append style={
+        mark = *,
+        mark size = 1pt,
+        line width=0.8pt
+    }
+}
+\tikzset{
+    legend entry/.style={
+        mark = *,
+        mark size = 1pt,
+        mark indices = {2},
+        line width=0.8pt
+    }
+}
+
+\begin{document}
+\begin{tikzpicture}[>=latex]
+
+    \begin{axis}[
+        name=sim-1a,
+        xticklabel=\empty
+    ]
+        \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1a-normal.csv};
+        \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1a-normal.csv};
+        \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1a-normal.csv};
+        \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1a-normal.csv};
+        \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1a-normal.csv};
+    \end{axis}
+    \node[anchor = base west, yshift = 0.3em] at (sim-1a.north west) {
+        a: linear dependence on $\mathcal{F}_y \equiv y$
+    };
+
+    \begin{axis}[
+        name=sim-1b,
+        anchor=north west, at={(sim-1a.right of north east)}, xshift=0.1cm,
+        xticklabel=\empty,
+        ylabel near ticks, yticklabel pos=right
+    ]
+        \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1b-normal.csv};
+        \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1b-normal.csv};
+        \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1b-normal.csv};
+        \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1b-normal.csv};
+        \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1b-normal.csv};
+    \end{axis}
+    \node[anchor = base west, yshift = 0.3em] at (sim-1b.north west) {
+        b: cubic dependence on $y$
+    };
+
+    \begin{axis}[
+        name=sim-1c,
+        anchor=north west, at={(sim-1a.below south west)}, yshift=-.8em,
+        
xticklabel=\empty
+    ]
+        \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1c-normal.csv};
+        \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1c-normal.csv};
+        \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1c-normal.csv};
+        \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1c-normal.csv};
+        \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1c-normal.csv};
+    \end{axis}
+    \node[anchor = base west, yshift = 0.3em] at (sim-1c.north west) {
+        c: rank $1$ $\boldsymbol{\beta}$'s
+    };
+
+    \begin{axis}[
+        name=sim-1d,
+        anchor=north west, at={(sim-1c.right of north east)}, xshift=0.1cm,
+        ylabel near ticks, yticklabel pos=right
+    ]
+        \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1d-normal.csv};
+        \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1d-normal.csv};
+        \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1d-normal.csv};
+        \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1d-normal.csv};
+        \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1d-normal.csv};
+    \end{axis}
+    \node[anchor = base west, yshift = 0.3em] at (sim-1d.north west) {
+        d: tri-diagonal $\boldsymbol{\Omega}$'s
+    };
+
+    \begin{axis}[
+        name=sim-1e,
+        anchor=north west, at={(sim-1c.below south west)}, yshift=-.8em
+    ]
+        \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1e-normal.csv};
+        \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1e-normal.csv};
+        \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1e-normal.csv};
+        \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1e-normal.csv};
+        \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1e-normal.csv};
+    \end{axis}
+    \node[anchor = base west, yshift = 0.3em] at (sim-1e.north west) {
+        e: misspecified
+    };
+
+
+    \matrix[anchor = center] at (sim-1e.right of east -| sim-1d.south) {
+        \draw[color=gmlm, legend entry, line width=1pt] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {GMLM}; \\
+        \draw[color=tsir, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {TSIR}; \\
+        \draw[color=mgcca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {MGCCA}; \\
+        \draw[color=hopca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {HOPCA}; \\
+        \draw[color=pca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {PCA}; \\
+    };
+
+
+    \node[anchor = north] at (current bounding box.south) {Sample Size $n$};
+
+    \node[anchor = south, rotate = 90] at (current bounding box.west) {Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$};
+    \node[anchor = south, rotate = 270] at (current bounding box.east) {\phantom{Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$}};
+
+    \node[anchor = south, font=\large] at (current bounding box.north) {Tensor Normal Simulation};
+
+\end{tikzpicture}
+\end{document}
diff --git a/tensorPredictors/NAMESPACE b/tensorPredictors/NAMESPACE
index fe7a243..ba4463d 100644
--- a/tensorPredictors/NAMESPACE
+++ b/tensorPredictors/NAMESPACE
@@ -11,8 +11,8 @@ export(.projRank)
 export(.projSymBand)
 export(.projSymRank)
 export(CISE)
-export(D)
-export(D.pinv)
+export(Dup)
+export(Dup.pinv)
 export(GMLM)
 export(GMLM.default)
 export(HOPCA)
@@ -32,6 +32,7 @@ export(RMReg)
 export(RMap)
 export(S)
 export(TSIR)
+export(approx.kron)
 export(approx.kronecker)
 export(colKronecker)
 export(decompose.kronecker)
@@ -41,6 +42,7 @@ export(dist.projection)
 export(dist.subspace)
 export(exprs.all.equal)
 export(gmlm_ising)
+export(gmlm_mlm)
 export(gmlm_tensor_normal)
 export(icu_tensor_normal)
 export(ising_m2)
@@ -72,6 +74,7 @@ export(reduce)
 export(riccati)
 export(rowKronecker)
 export(rtensornorm)
+export(slice.assign.expr)
 export(slice.expr)
 export(slice.select)
 export(sylvester)
diff --git a/tensorPredictors/R/approx_kronecker.R b/tensorPredictors/R/approx_kronecker.R
index 5cc20b2..56b8595 100644
--- a/tensorPredictors/R/approx_kronecker.R
+++ b/tensorPredictors/R/approx_kronecker.R
@@ -1,3 +1,37 @@
+#' Approximate Multi-Factor Kronecker Product Decomposition
+#'
+#' Approximates matrices `A_1`, ..., `A_r` of dimensions `nrows[k]` by
+#' `ncols[k]` such that `Reduce(kronecker, approx.kron(A, nrows, ncols))`
+#' approximates `A`, generalizing the two-factor `approx.kronecker`.
+#'
+#' @examples
+#' nrows <- c(3, 2, 5)
+#' ncols <- c(2, 4, 8)
+#'
+#' A <- Reduce(kronecker, Map(matrix, Map(rnorm, nrows * ncols), nrows))
+#'
+#' all.equal(A, Reduce(kronecker, approx.kron(A, nrows, ncols)))
+#'
+#' @export
+approx.kron <- function(A, nrows, ncols) {
+
+    # rearrange kronecker product `A` into outer product `R`
+    dim(A) <- c(rev(nrows), rev(ncols))
+    axis.perm <- as.vector(t(matrix(seq_along(dim(A)), ncol = 2L))[, rev(seq_along(nrows))])
+    R <- aperm(A, axis.perm, resize = FALSE)
+    dim(R) <- nrows * ncols
+
+    # scaling factor for every product component
+    s <- sum(A^2)^(1 / length(dim(A)))
+
+    # higher order SVD on `R` with one singular vector
+    Map(function(mode, nr, nc) {
+        s * `dim<-`(La.svd(mcrossprod(R, mode = mode), 1L, 0L)$u, c(nr, nc))
+    }, seq_along(nrows), nrows, ncols)
+
+}
+
+
 #' Approximates Kronecker Product decomposition.
 #'
 #' Approximates the matrices `A` and `B` such that
diff --git a/tensorPredictors/R/gmlm_ising.R b/tensorPredictors/R/gmlm_ising.R
index c06c0d2..1edabdd 100644
--- a/tensorPredictors/R/gmlm_ising.R
+++ b/tensorPredictors/R/gmlm_ising.R
@@ -1,42 +1,110 @@
 #' Specialized version of the GMLM for the Ising model (inverse Ising problem)
 #'
+#' @todo TODO: Add beta and Omega projections
+#'
 #' @export
 gmlm_ising <- function(X, F, sample.axis = length(dim(X)),
+    # proj.betas = ..., proj.Omegas = ..., # TODO: this
     max.iter = 1000L,
     eps = sqrt(.Machine$double.eps),
-    step.size = function(iter) 1e-2 * (1000 / (iter + 1000)),
-    zig.zag.threashold = 5L,
-    patience = 5L,
-    nr.slices = 10L,                        # only for univariate `F(y) = y`
-    slice.method = c("cut", "ecdf"),        # only for univariate `F(y) = y` and `y` is a factor or integer
+    step.size = 1e-3,
+    zig.zag.threashold = 20L,
+    patience = 3L,
+    nr.slices = 20L,                        # only for univariate `F(y) = y`
+    slice.method = c("cut", "ecdf", "none"),    # only for univariate `F(y) = y` and `y` is a factor or integer
    logger = function(...) { }
) {
-    # # Special case for univariate response `vec F(y) = y`
-    # # Due to high computational costs we use slicing
-    # if (length(F) == prod(dim(F))) {
-    #     y <- as.vector(F)
-    #     if (!(is.factor(y) || is.integer(y))) {
-    #         slice.method <- match.arg(slice.method)
-    #         if (slice.method == "ecdf") {
-    #             y <- cut(ecdf(y)(y), nr.slices)
-    #         } else {
-    #             y <- cut(y, nr.slices)
-    #         }
-    #     }
-    #     slices <- split(seq_len(sample.size), y, drop = TRUE)
-    # } else {
-    #     slices <- seq_len(sample.size)
-    # }
+    # Get problem dimensions (the sample size is needed before reshaping `F`)
+    dimX <- dim(X)[-sample.axis]
+    sample.size <- dim(X)[sample.axis]
+    # treat scalar `F` as a tensor
+    if (is.null(dim(F))) {
+        dimF <- rep(1L, length(dimX))
+        dim(F) <- ifelse(seq_along(dim(X)) == sample.axis, sample.size, 1L)
+    } else {
+        dimF <- dim(F)[-sample.axis]
+    }
-    dimX <- head(dim(X), -1)
-    dimF <- head(dim(F), -1)
-    sample.axis <- length(dim(X))
-    modes <- seq_len(length(dim(X)) - 1)
-    sample.size <- tail(dim(X), 1)
+    # rearrange `X`, `F` such that the last axis enumerates observations
+    if (sample.axis != length(dim(X))) {
+        axis.perm <- c(seq_along(dim(X))[-sample.axis], sample.axis)
+        X <- aperm(X, axis.perm)
+        F <- aperm(F, axis.perm)
+        sample.axis <- length(dim(X))
+    }
+    modes <- seq_along(dimX)

-    betas <- Map(matrix, Map(rnorm, dimX * dimF), dimX)
-    Omegas <- Map(diag, dimX)
+    # Special case for univariate response `vec F(y) = y`
+    # Due to high computational costs we use slicing
+    slice.method <- match.arg(slice.method)
+    slices.ind <- if ((slice.method != "none") && (length(F) == prod(dim(F)))) {
+        y <- as.vector(F)
+        if (!(is.factor(y) || is.integer(y))) {
+            slice.method <- match.arg(slice.method)
+            if (slice.method == "ecdf") {
+                y <- cut(ecdf(y)(y), nr.slices)
+            } else {
+                y <- cut(y, nr.slices)
+            }
+        }
+        split(seq_len(sample.size), y, drop = TRUE)
+    } else {
+        seq_len(sample.size)
+    }
+
+    # initialize betas with tensor normal estimate (ignoring data being binary)
+    fit_normal <- gmlm_tensor_normal(X, F, sample.axis = length(dim(X)))
+    betas <- fit_normal$betas
+
+    Omegas <- Omegas.init <- Map(function(mode) {
+        n <- prod(dim(X)[-mode])
+        prob2 <- mcrossprod(X, mode = mode) / n
+        prob2[prob2 == 0] <- 1 / n
+        prob1 <- diag(prob2)
+        `prob1^2` <- outer(prob1, prob1)
+
+        `diag<-`(log(((1 - `prob1^2`) / `prob1^2`) * prob2 / (1 - prob2)), 0)
+    }, modes)
+
+    # Determine degenerate combinations, that is, pairs of variables which are
+    # mutually exclusive in the data set
+    matX <- mat(X, sample.axis)
+    degen <- crossprod(matX) == 0
+    degen.mask <- which(degen)
+    # If there are degenerate combinations, compute an (arbitrary) bound for
+    # the log odds parameters of those combinations
+    if (any(degen.mask)) {
+        degen.ind <- arrayInd(degen.mask, dim(degen))
+        meanX <- colMeans(matX)
+        prodX <- meanX[degen.ind[, 1]] * meanX[degen.ind[, 2]]
+        degen.bounds <- log((1 - prodX) / (prodX * sample.size))
+        # Component indices in Omegas of degenerate two-way interactions
+        degen.ind <- arrayInd(degen.mask, rep(dimX, 2))
+        degen.ind <- Map(function(d, m) {
+            degen.ind[, m] + dimX[m] * (degen.ind[, m + length(dimX)] - 1L)
+        }, dimX, seq_along(dimX))
+
+        ## Enforce initial value degeneracy interaction param.
constraints + # Extract parameters corresponding to degenerate interactions + degen.params <- do.call(rbind, Map(`[`, Omegas, degen.ind)) + # Degeneracy Constrained Parameters (sign is dropped) + DCP <- mapply(function(vals, bound) { + logVals <- log(abs(vals)) + err <- max(0, sum(logVals) - log(abs(bound))) + exp(logVals - (err / length(vals))) + }, split(degen.params, col(degen.params)), degen.bounds) + # Update values in Omegas such that all degeneracy constraints hold + Omegas <- Map(function(Omega, cp, ind) { + # Combine multiple constraints for every element into single + # constraint value per element + cp <- mapply(min, split(abs(cp), ind)) + ind <- as.integer(names(cp)) + `[<-`(Omega, ind, sign(Omega[ind]) * cp) + }, Omegas, split(DCP, row(DCP)), degen.ind) + } + + # Initialize mean squared gradients grad2_betas <- Map(array, 0, Map(dim, betas)) grad2_Omegas <- Map(array, 0, Map(dim, Omegas)) @@ -49,12 +117,8 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)), non_improving <- 0L # technical access points to dynamicaly access a multi-dimensional array - # with its last index - `X[..., i]` <- slice.expr(X, sample.axis, index = i) + `X[..., i]` <- slice.expr(X, sample.axis, index = i, drop = FALSE) `F[..., i]` <- slice.expr(F, sample.axis, index = i, drop = FALSE) - # the next expression if accessing the precomputed `mlm(F, betas)` - `BF[..., i]` <- slice.expr(BF, sample.axis, nr.axis = sample.axis, drop = FALSE) # BF[..., i] - # Iterate till a break condition triggers or till max. nr. of iterations for (iter in seq_len(max.iter)) { @@ -62,31 +126,36 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)), grad_betas <- Map(matrix, 0, dimX, dimF) Omega <- Reduce(kronecker, rev(Omegas)) + # second order residuals accumulator + # `sum_i (X_i o X_i - E[X o X | Y = y_i])` R2 <- array(0, dim = c(dimX, dimX)) # negative log-likelihood loss <- 0 - BF <- mlm(F, betas) + for (i in slices.ind) { + # slice size (nr. 
of objects in the slice) + n_i <- length(i) - for (i in seq_len(sample.size)) { - params_i <- Omega + diag(as.vector(eval(`BF[..., i]`))) + sumF_i <- rowSums(eval(`F[..., i]`), dims = length(dimF)) + + diag_params_i <- mlm(sumF_i / n_i, betas) + params_i <- Omega + diag(as.vector(diag_params_i)) m2_i <- ising_m2(params_i) # accumulate loss - x_i <- as.vector(eval(`X[..., i]`)) - loss <- loss - (sum(x_i * (params_i %*% x_i)) + log(attr(m2_i, "prob_0"))) + matX_i <- mat(eval(`X[..., i]`), modes) + loss <- loss - ( + sum(matX_i * (params_i %*% matX_i)) + n_i * log(attr(m2_i, "prob_0")) + ) - R2_i <- tcrossprod(x_i) - m2_i + R2_i <- tcrossprod(matX_i) - n_i * m2_i R1_i <- diag(R2_i) dim(R1_i) <- dimX for (j in modes) { grad_betas[[j]] <- grad_betas[[j]] + - mcrossprod( - R1_i, mlm(eval(`F[..., i]`), betas[-j], modes[-j]), j, - dimB = ifelse(j != modes, dimX, dimF) - ) + mcrossprod(R1_i, mlm(sumF_i, betas[-j], modes[-j]), j) } R2 <- R2 + as.vector(R2_i) } @@ -98,17 +167,14 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)), }, modes) - # update and accumulate alternating loss + # update optimization behavioral trackers accum_sign <- sign(last_loss - loss) - accum_sign - # check if accumulated alternating signs exceed stopping threshold + non_improving <- max(0L, non_improving - 1L + 2L * (last_loss < loss)) + + # check break conditions if (abs(accum_sign) > zig.zag.threashold) { break } - # increment non improving counter if thats the case - if (!(loss < last_loss)) { - non_improving <- non_improving + 1L - } else { - non_improving <- 0L - } if (non_improving > patience) { break } + if (abs(last_loss - loss) < eps * last_loss) { break } # store current loss for the next iteration last_loss <- loss @@ -122,17 +188,219 @@ gmlm_ising <- function(X, F, sample.axis = length(dim(X)), # logging (before parameter update) logger(iter, loss, betas, Omegas, grad_betas, grad_Omegas) - # gradualy decrease the step size - step <- if (is.function(step.size)) step.size(iter) else step.size - # Update Parameters betas <- Map(function(beta, grad, m2) { - beta + (step / (sqrt(m2) + eps)) * grad + beta + (step.size / (sqrt(m2) + eps)) * grad }, betas, grad_betas, grad2_betas) Omegas <- Map(function(Omega, grad, m2) { - Omega + (step / (sqrt(m2) + eps)) * grad + Omega + (step.size / (sqrt(m2) + eps)) * grad }, Omegas, grad_Omegas, grad2_Omegas) + + # Enforce degeneracy parameter constraints + if (any(degen.mask)) { + # Extract parameters corresponding to degenerate interactions + degen.params <- do.call(rbind, Map(`[`, Omegas, degen.ind)) + # Degeneracy Constrained Parameters (sign is dropped) + DCP <- mapply(function(vals, bound) { + logVals <- log(abs(vals)) + err <- max(0, sum(logVals) - log(abs(bound))) + exp(logVals - (err / length(vals))) + }, split(degen.params, col(degen.params)), degen.bounds) + # Update values in Omegas such that all degeneracy constraints hold + Omegas <- Map(function(Omega, cp, ind) { + # Combine multiple constraints for every element into single + # constraint value per element + cp <- mapply(min, split(abs(cp), ind)) + ind <- as.integer(names(cp)) + `[<-`(Omega, ind, sign(Omega[ind]) * cp) + }, Omegas, split(DCP, row(DCP)), degen.ind) + } } - list(betas = betas, Omegas = Omegas) + structure( + list(eta1 = array(0, dimX), betas = betas, Omegas = Omegas), + tensor_normal = fit_normal, + Omegas.init = Omegas.init, + degen.mask = degen.mask + ) +} + + + +################################################################################ +### Development Interactive Block (Delete / 
Make sim / TODO: ...) ### +################################################################################ +if (FALSE) { # interactive() + +par(bg = "#1d1d1d", + fg = "lightgray", + col = "#d5d5d5", + col.axis = "#d5d5d5", + col.lab = "#d5d5d5", + col.main = "#d5d5d5", + col.sub = "#d5d5d5", # col.sub = "#2467d0" + pch = 16 +) +cex <- 1.25 +col <- colorRampPalette(c("#f15050", "#1d1d1d", "#567DCA"))(256) + + +.logger <- function() { + iter <- 0L + assign("log", data.frame( + iter = rep(NA_integer_, 100000), + loss = rep(NA_real_, 100000), + dist.B = rep(NA_real_, 100000), + dist.Omega = rep(NA_real_, 100000), + norm.grad.B = rep(NA_real_, 100000), + norm.grad.Omega = rep(NA_real_, 100000) + ), envir = .GlobalEnv) + assign("B.gmlm", NULL, .GlobalEnv) + assign("Omega.gmlm", NULL, .GlobalEnv) + + function(it, loss, betas, Omegas, grad_betas, grad_Omegas) { + # Store in global namespace (allows to stop and get the results) + B.gmlm <- Reduce(kronecker, rev(betas)) + assign("B.gmlm", B.gmlm, .GlobalEnv) + Omega.gmlm <- Reduce(kronecker, rev(Omegas)) + assign("Omega.gmlm", Omega.gmlm, .GlobalEnv) + + dist.B <- dist.subspace(B.true, B.gmlm, normalize = TRUE) + dist.Omega <- norm(Omega.true - Omega.gmlm, "F") + norm.grad.B <- sqrt(sum(mapply(norm, grad_betas, "F")^2)) + norm.grad.Omega <- sqrt(sum(mapply(norm, grad_Omegas, "F")^2)) + + log[iter <<- iter + 1L, ] <<- list( + it, loss, dist.B, dist.Omega, norm.grad.B, norm.grad.Omega + ) + cat(sprintf("\r%3d - d(B): %.3f, d(O): %.3f, |g(B)|: %.3f, |g(O)|: %.3f, loss: %.3f\033[K", + it, dist.B, dist.Omega, norm.grad.B, norm.grad.Omega, loss)) + } +} + + +sample.size <- 1000 +dimX <- c(2, 3) # predictor `X` dimension +dimF <- rep(1, length(dimX)) # "function" `F(y)` of responce `y` dimension + +betas <- Map(diag, 1, dimX, dimF) +Omegas <- list(toeplitz(c(0, -2)), toeplitz(seq(1, 0, by = -0.5))) + +B.true <- Reduce(kronecker, rev(betas)) +Omega.true <- Reduce(kronecker, rev(Omegas)) + +# data sampling routine +c(X, F, y, sample.axis) %<-% (sample.data <- function(sample.size, betas, Omegas) { + dimX <- mapply(nrow, betas) + dimF <- mapply(ncol, betas) + + # generate response (sample axis is last axis) + y <- runif(prod(sample.size, dimF), -2, 2) + F <- array(y, dim = c(dimF, sample.size)) # ~ U[-1, 1] + + Omega <- Reduce(kronecker, rev(Omegas)) + + X <- apply(F, length(dim(F)), function(Fi) { + dim(Fi) <- dimF + params <- diag(as.vector(mlm(Fi, betas))) + Omega + tensorPredictors::ising_sample(1, params) + }) + dim(X) <- c(dimX, sample.size) + + list(X = X, F = F, y = y, sample.axis = length(dim(X))) +})(sample.size, betas, Omegas) + +local({ + X.proto <- array(seq_len(prod(dimX)), dimX) + interactions <- crossprod(mat(X, sample.axis)) + dimnames(interactions) <- rep(list( + do.call(paste0, c("X", Map(slice.index, list(X.proto), seq_along(dimX)))) + ), 2) + cat("Sample Size: ", sample.size, "\n") + print.table(interactions, zero.print = ".") +}) + +# system.time({ +# fit.gmlm <- gmlm_ising(X, y, logger = .logger()) +# }) +Rprof() +gmlm_ising(X, y) +Rprof(NULL) +summaryRprof() + +B.gmlm <- Reduce(kronecker, rev(fit.gmlm$betas)) +Omega.gmlm <- Reduce(kronecker, rev(fit.gmlm$Omegas)) + +B.normal <- Reduce(kronecker, rev(attr(fit.gmlm, "tensor_normal")$betas)) +Omega.init <- Reduce(kronecker, rev(attr(fit.gmlm, "Omegas.init"))) +degen.mask <- attr(fit.gmlm, "degen.mask") + +local({ + layout(matrix(c( + 1, 2, 3, 3, 3, + 1, 4, 5, 6, 7 + ), nrow = 2, byrow = TRUE), width = c(6, 3, 1, 1, 1)) + + with(na.omit(log), { + plot(range(iter), c(0, 1), type = "n", bty 
= "n", + xlab = "Iterations", ylab = "Distance") + + lines(iter, dist.B, col = "red", lwd = 2) + lines(iter, dist.Omega / max(dist.Omega), col = "blue", lwd = 2) + lines(iter, (loss - min(loss)) / diff(range(loss)), col = "darkgreen", lwd = 2) + + norm.grad <- sqrt(norm.grad.B^2 + norm.grad.Omega^2) + # Scale all gradient norms + norm.grad.B <- norm.grad.B / max(norm.grad) + norm.grad.Omega <- norm.grad.Omega / max(norm.grad) + norm.grad <- norm.grad / max(norm.grad) + lines(iter, norm.grad.B, lty = 2, col = "red") + lines(iter, norm.grad.Omega, lty = 2, col = "blue") + lines(iter, norm.grad, lty = 2, col = "darkgreen") + + axis(4, at = c( + tail(dist.B, 1), + min(dist.B) + ), labels = round(c( + tail(dist.B, 1), + min(dist.B) + ), 2), col = NA, col.ticks = "red", las = 1) + axis(4, at = c( + 1, + tail(dist.Omega, 1) / max(dist.Omega), + min(dist.Omega) / max(dist.Omega) + ), labels = round(c( + max(dist.Omega), + tail(dist.Omega, 1), + min(dist.Omega) + ), 2), col = NA, col.ticks = "blue", las = 1) + + abline(h = c(tail(dist.B, 1), min(dist.B)), + lty = "dotted", col = "red") + abline(h = c(max(dist.Omega), tail(dist.Omega, 1), min(dist.Omega)) / max(dist.Omega), + lty = "dotted", col = "blue") + + }) + legend("topright", col = c("red", "blue", "darkgreen"), lty = 1, lwd = 2, + legend = c("dist.B", "dist.Omega", "loss"), bty = "n") + + zlim <- max(abs(range(Omega.true, Omega.init, Omega.gmlm))) * c(-1, 1) + matrixImage(Omega.true, main = "true", zlim = zlim, add.values = TRUE, col = col, cex = cex) + matrixImage(round(Omega.init, 2), main = "init (cond. prob.)", zlim = zlim, add.values = TRUE, col = col, cex = cex) + mtext(round(norm(Omega.true - Omega.init, "F"), 3), 3) + matrixImage(round(Omega.gmlm, 2), main = "gmlm (ising)", zlim = zlim, add.values = TRUE, col = col, cex = cex, + col.values = c(par("col"), "red")[`[<-`(array(1, rep(prod(dim(X)[-sample.axis]), 2)), degen.mask, 2)]) + mtext(round(norm(Omega.true - Omega.gmlm, "F"), 3), 3) + + zlim <- max(abs(range(B.true, B.normal, B.gmlm))) * c(-1, 1) + matrixImage(B.true, main = "true", + zlim = zlim, add.values = TRUE, col = col, cex = cex) + matrixImage(round(B.normal, 2), main = "init (normal)", + zlim = zlim, add.values = TRUE, axes = FALSE, col = col, cex = cex) + mtext(round(dist.subspace(B.true, B.normal, normalize = TRUE), 3), 3) + matrixImage(round(B.gmlm, 2), main = "gmlm (ising)", + zlim = zlim, add.values = TRUE, axes = FALSE, col = col, cex = cex) + mtext(round(dist.subspace(B.true, B.gmlm, normalize = TRUE), 3), 3) +}) + } diff --git a/tensorPredictors/R/gmlm_tensor_normal.R b/tensorPredictors/R/gmlm_tensor_normal.R index 9d0e55c..533c166 100644 --- a/tensorPredictors/R/gmlm_tensor_normal.R +++ b/tensorPredictors/R/gmlm_tensor_normal.R @@ -1,109 +1,10 @@ - -# p <- 5 -# A <- matrix(rnorm(p^2), p) - -# mat.proj("TriDiag", p)(A) -# mat.proj("SymTriDiag", p)(A) - -# A -# (AA <- mat.proj("PSD", p)(A)) -# mat.proj("PSD", p)(AA) - - -# p <- 5 -# A <- matrix(rnorm(p^2), p) - -# projection <- function(T2) { -# P <- pinv(T2) %*% T2 -# function(A) { -# matrix(P %*% as.vector(A), nrow(A)) -# } -# } - - -# # All equal diagonal matrix, A = a * I_p -# T2 <- t(as.vector(diag(p))) -# proj <- projection(T2) - -# print.table( diag(mean(diag(A)), nrow(A), ncol(A)) , zero.print = ".") -# print.table( matrix((pinv(T2) %*% T2) %*% as.vector(A), p) , zero.print = ".") -# print.table( proj(A) , zero.print = ".") - - -# # Diagonal matrix, A = diag(a_1, ..., a_p) -# T2 <- matrix(seq_len(p^2), p) -# T2 <- outer(diag(T2), as.vector(T2), `==`) -# 
storage.mode(T2) <- "double" -# proj <- projection(T2) - -# print.table( T2 , zero.print = ".") - -# print.table( diag(diag(A)) , zero.print = ".") -# print.table( proj(A) , zero.print = ".") - - -# # Tri-Diagonal Matrix -# T2 <- matrix(seq_len(p^2), p) -# T2 <- outer(T2[abs(row(A) - col(A)) <= 1], as.vector(T2), `==`) -# storage.mode(T2) <- "double" - -# triDiag.mask <- (abs(row(A) - col(A)) <= 1) -# storage.mode(triDiag.mask) <- "double" - -# print.table( T2 , zero.print = ".") -# print.table( triDiag.mask , zero.print = ".") - -# print.table( A * triDiag.mask , zero.print = ".") -# print.table( matrix((pinv(T2) %*% T2) %*% as.vector(A), p) , zero.print = ".") - - - -# # All equal main and off diagonals -# T2 <- Reduce(rbind, Map(function(i) { -# as.double(row(A) == i + col(A)) -# }, (1 - p):(p - 1))) - -# print.table( T2 , zero.print = ".") - -# print.table( matrix((pinv(T2) %*% T2) %*% as.vector(A), p) , zero.print = ".") - - -# # Symmetric all equal main and off diagonals -# T2 <- Reduce(rbind, Map(function(i) { -# as.double(abs(row(A) - col(A)) == i) -# }, 0:(p - 1))) - -# print.table( T2 , zero.print = ".") - -# print.table( matrix((pinv(T2) %*% T2) %*% as.vector(A), p) , zero.print = ".") - - -# # Symetric Matrix -# index <- matrix(seq_len(p^2), p) -# T2 <- Reduce(rbind, Map(function(i) { -# e_i <- index == i -# as.double(e_i | t(e_i)) -# }, index[lower.tri(index, diag = TRUE)])) -# proj <- projection(T2) - -# print.table( T2 , zero.print = ".") - -# print.table( 0.5 * (A + t(A)) , zero.print = ".") -# print.table( proj(A) , zero.print = ".") - -# proj(solve(A)) -# solve(proj(A)) - - - - #' Specialized version of GMLM for the tensor normal model #' #' The underlying algorithm is an ``iterative (block) coordinate descent'' method #' #' @export gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)), - max.iter = 100L, proj.betas = NULL, proj.Omegas = NULL, logger = NULL, # TODO: proj.betas NOT USED!!! 
+ max.iter = 100L, proj.betas = NULL, proj.Omegas = NULL, logger = NULL, cond.threshold = 25, eps = 1e-6 ) { # rearrange `X`, `F` such that the last axis enumerates observations @@ -152,7 +53,7 @@ gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)), }, mcov(F, sample.axis, center = FALSE)) # initialization of betas ``covariance direction mappings`` - betas <- Map(function(dX, dF) { + betas <- betas.init <- Map(function(dX, dF) { s <- min(ncol(dX), nrow(dF)) crossprod(dX[1:s, , drop = FALSE], dF[1:s, , drop = FALSE]) }, dirsX, dirsF) @@ -226,132 +127,8 @@ gmlm_tensor_normal <- function(X, F, sample.axis = length(dim(X)), } - list(eta1 = meanX, betas = betas, Omegas = Omegas) + structure( + list(eta1 = mlm(meanX, Sigmas), betas = betas, Omegas = Omegas), + betas.init = betas.init + ) } - - - -# #### DEBUGGING -# library(tensorPredictors) - -# # setup dimensions -# n <- 1e3 -# p <- c(5, 7, 3) -# q <- c(2, 5, 3) - -# max.iter <- 10L - -# # create "true" GLM parameters -# eta1 <- array(rnorm(prod(p)), dim = p) -# betas <- Map(matrix, Map(rnorm, p * q), p) -# Omegas <- Map(function(p_j) { -# solve(0.5^abs(outer(seq_len(p_j), seq_len(p_j), `-`))) -# }, p) -# true.params <- list(eta1 = eta1, betas = betas, Omegas = Omegas) - -# # compute tensor normal parameters from GMLM parameters -# Sigmas <- Map(solve, Omegas) -# mu <- mlm(eta1, Sigmas) - -# # sample some test data -# sample.axis <- length(p) + 1L -# F <- array(rnorm(n * prod(q)), dim = c(q, n)) -# X <- mlm(F, Map(`%*%`, Sigmas, betas)) + rtensornorm(n, mu, Sigmas, sample.axis) - -# # setup a logging callback -# format <- sprintf("iter: %%3d, %s, Omega: %%8.2f, resid: %%8.3f\n", -# paste0("beta.", seq_along(betas), ": %.3f", collapse = ", ") -# ) -# hist <- data.frame( -# iter = seq(0, max.iter), -# dist.B = numeric(max.iter + 1), -# dist.Omega = numeric(max.iter + 1), -# resid = numeric(max.iter + 1) -# ) -# logger <- function(iter, betas, Omegas, resid) { -# dist.B <- dist.kron.norm(true.params$betas, betas) -# dist.Omega <- dist.kron.norm(true.params$Omegas, Omegas) -# resid <- mean(resid^2) - -# hist[iter + 1, -1] <<- c(dist.B = dist.B, dist.Omega = dist.Omega, resid = resid) -# cat(do.call(sprintf, c( -# list(format, iter), -# Map(dist.subspace, betas, true.params$betas), -# dist.Omega, resid -# ))) -# } - -# # run the model -# est.params <- gmlm_tensor_normal(X, F, max.iter = max.iter, logger = logger) - -# # run model with Omegas in sub-manifolds of SPD matrices -# est.params <- gmlm_tensor_normal(X, F, max.iter = max.iter, logger = logger, -# proj.Omegas = list( -# NULL, -# (function(nrow, ncol) { -# triDiag.mask <- (abs(.row(c(nrow, ncol)) - .col(c(nrow, ncol))) <= 1) -# storage.mode(triDiag.mask) <- "double" -# function(A) { A * triDiag.mask } -# })(p[2], p[2]), -# function(A) diag(diag(A)) -# )) - - - -# # # Profile the fitting routine without logging -# # profvis::profvis({ -# # est.params <- gmlm_tensor_normal(X, F, max.iter = max.iter) -# # }) - -# with(hist, { -# plot(range(iter), range(hist[, -1]), type = "n", log = "y") -# lines(iter, resid, col = "red") -# lines(iter, dist.B, col = "green") -# lines(iter, dist.Omega, col = "blue") -# legend("topright", legend = c("resid", "dist.B", "dist.Omega"), col = c("red", "green", "blue"), lty = 1) -# }) - - -# # par(mfrow = c(1, 2)) - -# # matrixImage(Reduce(kronecker, rev(est.params$Omegas))) -# # matrixImage(Reduce(kronecker, rev(true.params$Omegas))) -# # # matrixImage(Reduce(kronecker, rev(est.params$Omegas)) - Reduce(kronecker, rev(true.params$Omegas))) - -# # 
par(mfrow = c(1, 2)) - -# # matrixImage(Reduce(kronecker, rev(true.params$betas)), main = "True", col = hcl.colors(48, palette = "Blue-Red 3")) -# # matrixImage(Reduce(kronecker, rev(est.params$betas)), main = "Est", col = hcl.colors(48, palette = "Blue-Red 3")) - - -# # # unlist(Map(kappa, mcov(X))) -# # # unlist(Map(kappa, Omegas)) -# # # unlist(Map(kappa, Sigmas)) - - -# # # max(unlist(Map(function(C) max(eigen(C)$values), mcov(X)))) -# # # min(unlist(Map(function(C) min(eigen(C)$values), mcov(X)))) -# # # Map(function(C) eigen(C)$values, mcov(X)) - -# # # kappa(Reduce(kronecker, mcov(X))) - -# # kappa1.fun <- function(Omegas) Map(kappa, Omegas) -# # kappa2.fun <- function(Omegas) Map(kappa, Omegas, exact = TRUE) -# # eigen1.fun <- function(Omegas) { -# # Map(function(Omega) { -# # min_max <- range(eigen(Omega)$values) -# # min_max[2] / min_max[1] -# # }, Omegas) -# # } -# # eigen2.fun <- function(Omegas) { -# # Map(function(Omega) { -# # min_max <- range(eigen(Omega, TRUE, TRUE)$values) -# # min_max[2] / min_max[1] -# # }, Omegas) -# # } -# # microbenchmark::microbenchmark( -# # kappa1.fun(Omegas), -# # kappa2.fun(Omegas), -# # eigen1.fun(Omegas), -# # eigen2.fun(Omegas) -# # ) diff --git a/tensorPredictors/R/matrixImage.R b/tensorPredictors/R/matrixImage.R index 1f53bc0..abe44b7 100644 --- a/tensorPredictors/R/matrixImage.R +++ b/tensorPredictors/R/matrixImage.R @@ -19,6 +19,7 @@ matrixImage <- function(A, add.values = FALSE, main = NULL, sub = NULL, interpolate = FALSE, ..., zlim = NA, axes = TRUE, asp = 1, col = hcl.colors(24, "Blue-Red 3", rev = FALSE), + col.values = par("col"), cex = 1, digits = getOption("digits"), new.plot = TRUE ) { # plot raster image @@ -56,6 +57,7 @@ matrixImage <- function(A, add.values = FALSE, A[!add.values] <- NA A[add.values] <- format(A[add.values], digits = digits) } - text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A, adj = 0.5) + text(rep(x - 0.5, each = nrow(A)), rep(rev(y - 0.5), ncol(A)), A, + adj = 0.5, cex = cex, col = col.values) } }
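
A minimal usage sketch for the updated `gmlm_tensor_normal()` return value: the fit now carries its initial `betas` as the `betas.init` attribute, so the starting point of the block coordinate descent can be compared against the final estimate. This is an illustration only, not part of the patch; it assumes `mlm()`, `rtensornorm()` and `dist.subspace()` from `tensorPredictors` (all used elsewhere in this repository) and uses arbitrary dimensions.

library(tensorPredictors)

n <- 500
p <- c(4, 3)    # predictor dimensions
q <- c(2, 2)    # response-function dimensions

# "true" parameters with AR(1)-type inverse covariances
betas <- Map(matrix, Map(rnorm, p * q), p)
Omegas <- Map(function(pk) solve(0.5^abs(outer(seq_len(pk), seq_len(pk), `-`))), p)
Sigmas <- Map(solve, Omegas)

# sample tensor normal data (sample axis is the last axis)
F <- array(rnorm(n * prod(q)), dim = c(q, n))
X <- mlm(F, Map(`%*%`, Sigmas, betas)) +
    rtensornorm(n, array(0, dim = p), Sigmas, length(p) + 1L)

fit <- gmlm_tensor_normal(X, F)

# initial vs. final estimates in Kronecker product form
B.init <- Reduce(kronecker, rev(attr(fit, "betas.init")))
B.hat  <- Reduce(kronecker, rev(fit$betas))
dist.subspace(B.init, B.hat, normalize = TRUE)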
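Likewise, a short sketch of the new `matrixImage()` arguments: `cex` scales the overlaid cell labels and `col.values` colors them, either one color for all cells or one color per cell, recycled in the column-major order of `as.vector(A)`. Again an illustration, not part of the patch.

library(tensorPredictors)

A <- matrix(rnorm(12), 3, 4)
# overlay rounded values, shrunk slightly, with negative entries in red
matrixImage(round(A, 2), add.values = TRUE, cex = 0.75,
    col.values = ifelse(as.vector(A) < 0, "red", "black"))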