\documentclass[a4paper, 10pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{fullpage}
\usepackage{amsmath, amssymb, amstext, amsthm}
\usepackage{bm} % \boldsymbol and italic corrections, ...
\usepackage[pdftex]{hyperref}
\usepackage{makeidx} % Index (Symbols, Names, ...)
\usepackage{xcolor, graphicx} % colors and including images
\usepackage{tikz}
\usetikzlibrary{calc}
\usepackage[
% backend=bibtex,
style=authoryear-comp
]{biblatex}
\usepackage{algorithm, algpseudocode} % Pseudo Codes / Algorithms
% Document meta info
\title{Bernoulli}
\author{Daniel Kapla}
\date{\today}
% Set PDF title, author and creator.
\AtBeginDocument{
\hypersetup{
pdftitle = {Bernoulli},
pdfauthor = {Daniel Kapla},
pdfcreator = {\pdftexbanner}
}
}
\makeindex
% Bibliography resource(s)
\addbibresource{main.bib}
% Setup environments
% Theorem, Lemma
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{example}{Example}
% Definition
\theoremstyle{definition}
\newtheorem{defn}{Definition}
% Remark
\theoremstyle{remark}
\newtheorem{remark}{Remark}
% Define math macros
\newcommand{\mat}[1]{\boldsymbol{#1}}
\newcommand{\ten}[1]{\mathcal{#1}}
\renewcommand{\vec}{\operatorname{vec}}
\newcommand{\dist}{\operatorname{dist}}
\newcommand{\rank}{\operatorname{rank}}
\DeclareMathOperator{\kron}{\otimes} % Kronecker Product
\DeclareMathOperator{\hada}{\odot} % Hadamard Product
\newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix)
\DeclareMathOperator{\df}{df}
\DeclareMathOperator{\tr}{tr}
\DeclareMathOperator{\var}{Var}
\DeclareMathOperator{\cov}{Cov}
\DeclareMathOperator{\Span}{Span}
\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}}
% \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}}
\DeclareMathOperator*{\argmin}{{arg\,min}}
\DeclareMathOperator*{\argmax}{{arg\,max}}
\newcommand{\D}{\textnormal{D}} % derivative
\renewcommand{\d}{\textnormal{d}} % differential
\renewcommand{\t}[1]{{#1^{\prime}}} % matrix transpose
\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse`
\newcommand{\vech}{\operatorname{vech}}
\newcommand{\logical}[1]{{[\![#1]\!]}}
\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Bivariate Bernoulli Distribution}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A random 2-vector $X\in\{0, 1\}^2$ follows a \emph{Bivariate Bernoulli} distribution if its pmf is
\begin{displaymath}
P(X = (x_1, x_2)) = p_{00}^{(1-x_1)(1-x_2)}p_{01}^{(1-x_1)x_2}p_{10}^{x_1(1-x_2)}p_{11}^{x_1x_2}
\end{displaymath}
where $p_{ab} = P(X = (a, b))$ for $a, b\in\{0, 1\}$. An alternative formulation, in terms of log-odds, follows immediately as
\begin{displaymath}
P(X = (x_1, x_2)) = p_{00}\exp\Big(x_1\log\frac{p_{10}}{p_{00}} + x_2\log\frac{p_{01}}{p_{00}} + x_1x_2\log\frac{p_{00}p_{11}}{p_{01}p_{10}}\Big).
\end{displaymath}
Collecting the log-odds in a parameter vector $\mat{\theta} = \t{(\theta_{10}, \theta_{11}, \theta_{01})}$, ordered to match the half-vectorization $\vech(\mat{x}\t{\mat{x}}) = \t{(x_1^2, x_1 x_2, x_2^2)} = \t{(x_1, x_1 x_2, x_2)}$ of binary vectors, where
\begin{align*}
\theta_{01} &= \log\frac{p_{01}}{p_{00}}, \\
\theta_{10} &= \log\frac{p_{10}}{p_{00}}, \\
\theta_{11} &= \log\frac{p_{00}p_{11}}{p_{01}p_{10}}
\end{align*}
the pmf can be written more compactly as
\begin{displaymath}
P(X = (x_1, x_2)) = P(X = \mat{x}) = p_{00}\exp(\t{\mat{\theta}}\vech(\mat{x}\t{\mat{x}}))
= p_{00}\exp(\t{\mat{x}}\mat{\Theta}\mat{x})
\end{displaymath}
with the parameter matrix $\mat{\Theta}$ defined as
\begin{displaymath}
\mat{\Theta} = \begin{pmatrix}
\theta_{10} & \tfrac{1}{2}\theta_{11} \\
\tfrac{1}{2}\theta_{11} & \theta_{01}
\end{pmatrix} = \begin{pmatrix}
\log\frac{p_{10}}{p_{00}} & \tfrac{1}{2}\log\frac{p_{00}p_{11}}{p_{01}p_{10}} \\
\tfrac{1}{2}\log\frac{p_{00}p_{11}}{p_{01}p_{10}} & \log\frac{p_{01}}{p_{00}}
\end{pmatrix}.
\end{displaymath}
The marginal distributions of $X_1$ and $X_2$ are given by
\begin{align*}
P(X_1 = x_1) &= P(X = (x_1, 0)) + P(X = (x_1, 1)) \\
&= p_{00}^{1-x_1}p_{10}^{x_1} + p_{01}^{1-x_1}p_{11}^{x_1} \\
&= \begin{cases}
p_{00} + p_{01}, & x_1 = 0 \\
p_{10} + p_{11}, & x_1 = 1
\end{cases} \\
&= (p_{00} + p_{01})^{1-x_1}(p_{10} + p_{11})^{x_1}, \\
P(X_2 = x_2) &= (p_{00} + p_{10})^{1-x_2}(p_{01} + p_{11})^{x_2}.
\end{align*}
Furthermore, the conditional distributions are
\begin{align*}
P(X_1 = x_1|X_2 = x_2) = \frac{P(X = (x_1, x_2))}{P(X_2 = x_2)}
\propto \big(p_{00}^{1-x_2}p_{01}^{x_2}\big)^{1-x_1}\big(p_{10}^{1-x_2}p_{11}^{x_2}\big)^{x_1}, \\
P(X_2 = x_2|X_1 = x_1) = \frac{P(X = (x_1, x_2))}{P(X_1 = x_1)}
\propto \big(p_{00}^{1-x_1}p_{10}^{x_1}\big)^{1-x_2}\big(p_{01}^{1-x_1}p_{11}^{x_1}\big)^{x_2}.
\end{align*}
Note that both the marginal and the conditional distributions are again Bernoulli. It is also of interest to look at the covariance between the components of $X$, which is given by
\begin{displaymath}
\cov(X_1, X_2) = \E[(X_1 - \E X_1)(X_2 - \E X_2)] = p_{00}p_{11} - p_{01}p_{10}
\end{displaymath}
which follows by direct computation.
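As a quick numerical sanity check of the above formulas, the following Python sketch (with illustrative, hypothetical probabilities) verifies the log-odds parameterization of the pmf as well as the marginal and covariance expressions by direct enumeration.
\begin{verbatim}
# Minimal check of the bivariate Bernoulli formulas with illustrative
# (hypothetical) probabilities p00, p01, p10, p11.
import numpy as np

p = {(0, 0): 0.4, (0, 1): 0.3, (1, 0): 0.2, (1, 1): 0.1}

# log-odds parameters
theta_01 = np.log(p[0, 1] / p[0, 0])
theta_10 = np.log(p[1, 0] / p[0, 0])
theta_11 = np.log(p[0, 0] * p[1, 1] / (p[0, 1] * p[1, 0]))

# pmf via the log-odds form, compared against the table of probabilities
for (x1, x2), prob in p.items():
    val = p[0, 0] * np.exp(x1 * theta_10 + x2 * theta_01 + x1 * x2 * theta_11)
    assert np.isclose(val, prob)

# marginals and covariance
P_x1 = p[1, 0] + p[1, 1]                  # P(X1 = 1)
P_x2 = p[0, 1] + p[1, 1]                  # P(X2 = 1)
cov = p[1, 1] - P_x1 * P_x2               # E[X1 X2] - E[X1] E[X2]
assert np.isclose(cov, p[0, 0] * p[1, 1] - p[0, 1] * p[1, 0])
\end{verbatim}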
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Multivariate Bernoulli Distribution}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
This is a direct generalization of the Bivariate Bernoulli distribution. Before we start, a few notes on notation: for binary vectors $a, b$ we write $\logical{a = b} = 1$ if and only if $a_i = b_i$ for all $i$, and $\logical{a = b} = 0$ otherwise. With that, let $Y\in\{0, 1\}^q$ be a $q$-dimensional \emph{Multivariate Bernoulli} random variable with pmf
\begin{equation}\label{eq:mvb_pmf}
P(Y = y) = \prod_{a\in\{0, 1\}^q} p_a^{\logical{y = a}} = p_y.
\end{equation}
The $2^q$ parameters $p_a = P(Y = a)$ are indexed by the events $a\in\{0, 1\}^q$. The ``indexing'' is done by identifying an event $a\in\{0, 1\}^q$ with the binary number it represents. In more detail, we equate an event $a\in\{0, 1\}^q$ with a number $m\in\{0, 1, \ldots, 2^q - 1\}$ as
\begin{equation}\label{eq:mvb_numbering}
m = m(a) = \sum_{i = 1}^q 2^{q - i}a_i
\end{equation}
which is a one-to-one relation. For example, for $q = 3$ all events are numbered as in Table~\ref{tab:event-to-number}.
\begin{table}[!ht]
\centering
\begin{minipage}{0.8\textwidth}
\centering
\begin{tabular}{c|c}
Event $a$ & Number $m(a)$ \\ \hline
(0, 0, 0) & 0 \\
(0, 0, 1) & 1 \\
(0, 1, 0) & 2 \\
(0, 1, 1) & 3 \\
(1, 0, 0) & 4 \\
(1, 0, 1) & 5 \\
(1, 1, 0) & 6 \\
(1, 1, 1) & 7
\end{tabular}
\caption{\label{tab:event-to-number}\small Event numbering relation for $q = 3$. The events $a$ are all the possible elements of $\{0, 1\}^3$ and the numbers $m$ range from $0$ to $2^3 - 1 = 7$.}
\end{minipage}
\end{table}
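The event-to-number identification \eqref{eq:mvb_numbering} is easily computed; the short Python sketch below reproduces Table~\ref{tab:event-to-number}.
\begin{verbatim}
# Event-to-number identification m(a) = sum_i 2^(q - i) a_i for q = 3,
# reproducing the table above.
from itertools import product

q = 3
for a in product((0, 1), repeat=q):
    m = sum(2 ** (q - i - 1) * a_i for i, a_i in enumerate(a))
    print(a, m)
\end{verbatim}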
\subsection{Exponential Family and Natural Parameters}
The Multivariate Bernoulli is a member of the exponential family. This can be seen by rewriting the pmf \eqref{eq:mvb_pmf} in exponential family form. First, we take a look at $\logical{y = a}$ for two binary vectors $y, a$, which can be written as
\begin{align*}
\logical{y = a}
&= \prod_{i = 1}^q (y_i a_i + (1 - y_i)(1 - a_i))
= \prod_{i = 1}^q (y_i (2 a_i - 1) + (1 - a_i)) \\
&= \sum_{b\in\{0, 1\}^q}\prod_{i = 1}^q [y_i (2 a_i - 1)]^{b_i}(1 - a_i)^{1-b_i} \\
\intertext{where the last equality follows by multiplying out the product, similar to the binomial theorem. Note that the factor $(1 - a_i)^{1-b_i}$ is zero if $a_i = 1$ and $b_i = 0$, and equal to $1$ in all other cases. Therefore, the inner product vanishes unless $a \leq b$ component-wise; using $\logical{a \leq b}$ this gives}
...
&= \sum_{b\in\{0, 1\}^q}\logical{a\leq b}\prod_{i = 1}^q [y_i (2 a_i - 1)]^{b_i} \\
\intertext{Next, given $a \leq b$, we get $\prod_{i = 1}^q (2 a_i - 1)^{b_i} = (-1)^{\logical{|b| \not\equiv_2 |a|}}$ by counting the indices with $a_i = 0$ and $b_i = 1$, since $(2 a_i - 1)^{b_i} = -1$ exactly in this case and $1$ in every other case. The number of such indices is $|b| - |a|$, so the product is $1$ if $|b| \equiv_2 |a|$ and $-1$ otherwise. This leads to the final version of the rewriting of $\logical{y = a}$ as
}
...
&= \sum_{b\in\{0, 1\}^q}\logical{a\leq b}(-1)^{\logical{|b|\not\equiv_2|a|}}\prod_{i = 1}^q y_i^{b_i}.
\end{align*}
Now, taking the log of \eqref{eq:mvb_pmf} and substituting $\logical{y = a}$ gives
\begin{align*}
\log P(Y = y)
&= \sum_{a\in\{0, 1\}^q}\logical{y = a}\log p_a \\
&= \sum_{b\in\{0, 1\}^q}\sum_{a\in\{0, 1\}^q}\log(p_a)\logical{a\leq b}(-1)^{\logical{|b|\not\equiv_2|a|}}\prod_{i = 1}^q y_i^{b_i} \\
&= \sum_{b\in\{0, 1\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\sum_{a\leq b}\log(p_a)(-1)^{\logical{|b|\not\equiv_2|a|}} \\
&= \sum_{b\in\{0, 1\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\log\frac{\prod_{a\leq b, |a|\equiv_2|b|}p_a}{\prod_{a\leq b, |a|\not\equiv_2|b|}p_a}.
\end{align*}
For each $b\in\{0, 1\}^q$ except for $b = (0, ..., 0)$ define
\begin{displaymath}
\theta_{m(b)} = \theta_b = \log\frac{\prod_{a\leq b, |a|\equiv_2|b|}p_a}{\prod_{a\leq b, |a|\not\equiv_2|b|}p_a}, \qquad b\in\{0, 1\}^q\backslash\{0\}^q
\end{displaymath}
and collect the thetas in a combined vector $\mat{\theta} = (\theta_1, \theta_2, ..., \theta_{2^q - 1})$, where we used the event-to-number identification \eqref{eq:mvb_numbering}. The zero event is excluded because its parameter is not needed: it is already determined by all the other parameters and will be incorporated as the scaling factor of the pmf in the exponential family representation. Using the newly defined $\mat{\theta}$ we get
\begin{align*}
P(Y = y) &= \exp\log P(Y = y)
= \exp\left(\sum_{b\in\{0, 1\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\log\frac{\prod_{a\leq b, |a|\equiv_2|b|}p_a}{\prod_{a\leq b, |a|\not\equiv_2|b|}p_a}\right) \\
&= p_0\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right)
\end{align*}
The final step is to determine a representation of $p_0$ in terms of $\mat{\theta}$. This follows directly from the fact that the probabilities sum to $1$:
\begin{align*}
1 &= \sum_{y\in\{0, 1\}^q}P(Y = y) = p_0\left(1 + \sum_{y\in\{0, 1\}^q\backslash\{0\}^q}\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right)\right), \\
p_0(\mat{\theta}) &= \left(1 + \sum_{y\in\{0, 1\}^q\backslash\{0\}^q}\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right)\right)^{-1}.
\end{align*}
This gives the pmf representation as
\begin{equation}\label{eq:mvb_exp_fam}
P(Y = y) = p_0(\mat{\theta})\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right)
= p_0(\mat{\theta})\exp(\t{T(y)}\mat{\theta})
\end{equation}
which shows that the Multivariate Bernoulli is a member of the exponential family. Furthermore, the statistic $T(y)$ in \eqref{eq:mvb_exp_fam} is
\begin{displaymath}
T(y) = \left(\prod_{i = 1}^q y_i^{b_i}\right)_{b\in\{0, 1\}^q\backslash\{0\}^q}
\end{displaymath}
which is a $2^q - 1$ dimensional binary vector.
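As a sketch of how the natural parameters arise from the event probabilities, the following Python snippet (with illustrative, randomly generated probabilities) computes $\theta_b$ as defined above and checks the exponential family representation \eqref{eq:mvb_exp_fam} by enumeration.
\begin{verbatim}
# Natural parameters theta_b computed from (hypothetical) event probabilities
# p_a, and a check of P(Y = y) = p_0(theta) exp(T(y)' theta).
import numpy as np
from itertools import product

q = 3
events = list(product((0, 1), repeat=q))
prob = dict(zip(events, np.random.default_rng(0).dirichlet(np.ones(2 ** q))))

def theta_b(b):
    # theta_b = log( prod_{a <= b, |a| = |b| mod 2} p_a ) - log( prod of rest )
    val = 0.0
    for a in events:
        if all(ai <= bi for ai, bi in zip(a, b)):
            sign = 1.0 if sum(a) % 2 == sum(b) % 2 else -1.0
            val += sign * np.log(prob[a])
    return val

nonzero = [b for b in events if any(b)]
theta = np.array([theta_b(b) for b in nonzero])

def T(y):
    # statistic T(y) with entries prod_i y_i^{b_i} = [[b <= y]]
    return np.array([float(all(bi <= yi for bi, yi in zip(b, y))) for b in nonzero])

p0 = 1.0 / (1.0 + sum(np.exp(T(y) @ theta) for y in events if any(y)))
for y in events:
    assert np.isclose(p0 * np.exp(T(y) @ theta), prob[y])
\end{verbatim}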
\subsection{Expectation and Covariance}
First, the expectation of a Multivariate Bernoulli random vector $Y\sim\mathcal{B}_p$ is given by
\begin{displaymath}
(\E Y)_j = (\E Y_j) = \sum_{\substack{y\in\{0, 1\}^q}} P(Y = y)y_j = \sum_{\substack{y\in\{0, 1\}^q\\y_j = 1}}P(Y = y)
\end{displaymath}
for each of the $j = 1, ..., q$ components of the $q$-dimensional random vector $Y$. Its covariance is similarly given by
\begin{displaymath}
\cov(Y_i, Y_j)
= \E(Y_i - \E Y_i)(Y_j - \E Y_j)
= \E Y_i Y_j - (\E Y_i)(\E Y_j)
= \sum_{\substack{y\in\{0, 1\}^q\\y_i = y_j = 1}}P(Y = y) - (\E Y_i)(\E Y_j).
\end{displaymath}
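A corresponding sketch for these moments, again by brute-force enumeration over all $2^q$ events with illustrative probabilities:
\begin{verbatim}
# Expectation and covariance of a q-dimensional multivariate Bernoulli by
# enumerating all 2^q events (hypothetical probabilities).
import numpy as np
from itertools import product

q = 3
events = np.array(list(product((0, 1), repeat=q)), dtype=float)     # 2^q x q
p = np.random.default_rng(1).dirichlet(np.ones(2 ** q))             # p_a

EY = p @ events                             # (E Y)_j = sum_{y: y_j = 1} P(Y = y)
EYY = events.T @ (p[:, None] * events)      # E[Y Y']
cov_Y = EYY - np.outer(EY, EY)              # cov(Y_i, Y_j)
\end{verbatim}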
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{The Ising Model}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The Ising model is a special case of the Multivariate Bernoulli with its pmf defined directly in exponential family form by
\begin{displaymath}
P_{\mat{\theta}}(Y = \mat{y}) = p_0(\mat{\theta})\exp\left(\sum_{i = 1}^q \theta_{\iota(i, i)}y_i + \sum_{i = 1}^{q - 1} \sum_{j = i + 1}^q \theta_{\iota(i, j)}y_i y_j\right).
\end{displaymath}
This ``constrained'' model only considers two-way interactions, with the natural parameter vector $\mat{\theta}$ of size $q(q + 1)/2$. The indexing function $\iota$ maps pairs of component indices to the corresponding parameter indices
\begin{align*}
\iota(i, j) &= \iota_0(\min(i, j) - 1, \max(i, j) - 1) + 1, & i, j &= 1, ..., q \\
\intertext{with the $0$-indexed mapping}
\iota_0(i, j) &= \frac{i (2 q + 1 - i)}{2} + (j - i) & i, j &= 0, ..., q - 1\text{ and }i\leq j.
\end{align*}
This index mapping is constructed such that the single and two-way interaction terms in the half vectorization of the outer product $\mat{y}\t{\mat{y}}$ line up with the corresponding parameter indices in $\mat{\theta}$. In other words, the above pmf can be written as
\begin{displaymath}
P_{\mat{\theta}}(Y = \mat{y}) = p_0(\mat{\theta})\exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} ).
\end{displaymath}
The scaling factor $p_0(\mat{\theta})$ (which is also the probability of the zero event, therefore the name) is
\begin{displaymath}
p_0(\mat{\theta}) = \Big( \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} ) \Big)^{-1}.
\end{displaymath}
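The following Python sketch illustrates these ingredients: the index mapping $\iota$, the half vectorization, and the scaling factor $p_0(\mat{\theta})$ computed by brute-force enumeration (only feasible for small $q$); the parameter values are hypothetical.
\begin{verbatim}
# Ising model ingredients: index mapping iota(i, j), half vectorization
# vech(y y') and the normalization p_0(theta) by enumeration (small q only).
import numpy as np
from itertools import product

def iota(i, j, q):
    # 1-indexed pair (i, j) -> 1-indexed position in vech of a q x q matrix
    i0, j0 = min(i, j) - 1, max(i, j) - 1
    return i0 * (2 * q + 1 - i0) // 2 + (j0 - i0) + 1

def vech(M):
    # stack the lower triangular part (including diagonal) column by column
    return np.concatenate([M[j:, j] for j in range(M.shape[0])])

q = 3
theta = np.random.default_rng(0).normal(size=q * (q + 1) // 2)  # hypothetical

def unnormalized(y, theta):
    y = np.asarray(y, dtype=float)
    return np.exp(vech(np.outer(y, y)) @ theta)

p0 = 1.0 / sum(unnormalized(y, theta) for y in product((0, 1), repeat=q))
prob_101 = p0 * unnormalized((1, 0, 1), theta)   # e.g. P(Y = (1, 0, 1))
\end{verbatim}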
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Conditional Distribution}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For the conditional distribution under the Ising model let $I\subsetneq[q] = \{1, \ldots, q\}$ be a non-empty index set (non-empty to avoid the degenerate case $P_{\mat{\theta}}(\emptyset \mid Y = \mat{y})$, and a proper subset because $I = [q]$ corresponds to no conditioning at all). Denote by $\mat{y}_I$ the $|I|$-dimensional sub-vector of $\mat{y}$ consisting only of the components indexed by $I$, and by $\mat{y}_{-I}$ the $(q - |I|)$-dimensional vector of all components \emph{not} indexed by $I$. Then
\begin{displaymath}
P_{\mat{\theta}}(Y_{I} = \mat{y}_{I} \mid Y_{-I} = \mat{y}_{-I})
= \frac{P_{\mat{\theta}}(Y = \mat{y})}{P_{\mat{\theta}}(Y_{-I} = \mat{y}_{-I})}
\propto P_{\mat{\theta}}(Y_{I} = \mat{y}_{I}, Y_{-I} = \mat{y}_{-I})
\end{displaymath}
as a function of $\mat{y}_I$ for fixed $\mat{y}_{-I}$. Writing the exponent of the Ising pmf as a quadratic form $\t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} = \t{\mat{y}}\mat{\Theta}\mat{y}$, where $\mat{\Theta}$ is the symmetric $q\times q$ matrix with entries $\mat{\Theta}_{ii} = \theta_{\iota(i, i)}$ and $\mat{\Theta}_{ij} = \tfrac{1}{2}\theta_{\iota(i, j)}$ for $i\neq j$, and splitting the quadratic form into blocks according to $I$ and $-I$ gives
\begin{displaymath}
\t{\mat{y}}\mat{\Theta}\mat{y}
= \t{\mat{y}_I}\mat{\Theta}_{I,I}\mat{y}_I
+ 2\,\t{\mat{y}_I}\mat{\Theta}_{I,-I}\mat{y}_{-I}
+ \t{\mat{y}_{-I}}\mat{\Theta}_{-I,-I}\mat{y}_{-I}.
\end{displaymath}
The last summand depends only on $\mat{y}_{-I}$ and therefore cancels in the conditional, leading to
\begin{displaymath}
P_{\mat{\theta}}(Y_{I} = \mat{y}_{I} \mid Y_{-I} = \mat{y}_{-I})
= p_0(\mat{\theta} \mid Y_{-I} = \mat{y}_{-I})
\exp\big( \t{\mat{y}_I}\mat{\Theta}_{I,I}\mat{y}_I + 2\,\t{\mat{y}_I}\mat{\Theta}_{I,-I}\mat{y}_{-I} \big)
\end{displaymath}
with
\begin{displaymath}
p_0(\mat{\theta} \mid Y_{-I} = \mat{y}_{-I})
= P_{\mat{\theta}}(Y_{I} = \mat{0} \mid Y_{-I} = \mat{y}_{-I})
= \Big(\sum_{\mat{a}\in\{0, 1\}^{|I|}}
\exp\big( \t{\mat{a}}\mat{\Theta}_{I,I}\mat{a} + 2\,\t{\mat{a}}\mat{\Theta}_{I,-I}\mat{y}_{-I} \big)\Big)^{-1}.
\end{displaymath}
In other words, the conditional distribution is again an Ising model on the components indexed by $I$, with the linear terms shifted by the conditioning values.
Two special cases are of interest. First, the single component case $I = \{i\}$, for which the conditional simplifies to
\begin{displaymath}
P_{\mat{\theta}}(Y_{i} = {y}_{i} \mid Y_{-i} = \mat{y}_{-i})
= \frac{\exp\big( y_i\big(\theta_{\iota(i, i)} + \sum_{j\neq i}\theta_{\iota(i, j)}y_j\big) \big)}{1 + \exp\big( \theta_{\iota(i, i)} + \sum_{j\neq i}\theta_{\iota(i, j)}y_j \big)}.
\end{displaymath}
Writing $\theta_{\iota(i)} = \theta_{\iota(i, i)}$ for short and setting $\mat{y}_{-i} = \mat{0}$ we get
\begin{displaymath}
P_{\mat{\theta}}(Y_{i} = {y}_{i} \mid Y_{-i} = \mat{0})
= \frac{\exp( {\theta}_{\iota(i)} )^{y_i}}{1 + \exp( {\theta}_{\iota(i)} )}
\end{displaymath}
leading to
\begin{displaymath}
\theta_{\iota(i)} = \log\frac{P_{\mat{\theta}}(Y_{i} = 1 | Y_{-i} = \mat{0})}{P_{\mat{\theta}}(Y_{i} = 0 | Y_{-i} = \mat{0})} = \log\frac{P_{\mat{\theta}}(Y_{i} = 1 | Y_{-i} = \mat{0})}{1 - P_{\mat{\theta}}(Y_{i} = 1 | Y_{-i} = \mat{0})}
\end{displaymath}
The second case considers the conditional distribution of two components given that all the others are zero, meaning $I = \{i, j\}$ and $\mat{y}_{-i,-j} = \mat{0}$, which has the form
\begin{displaymath}
P_{\mat{\theta}}(Y_{i} = {y}_{i}, Y_{j} = {y}_{j} \mid Y_{-i,-j} = \mat{0})
= p_0(\mat{\theta} \mid Y_{-i,-j} = \mat{0})\exp(y_i\theta_{\iota(i)} + y_j\theta_{\iota(j)} + y_iy_j\theta_{\iota(i,j)}).
\end{displaymath}
Plugging in all combinations of $y_i, y_j\in\{0, 1\}$ we get that
\begin{align*}
\theta_{\iota(i, j)}
&= \log\frac{P_{\mat{\theta}}(Y_{i} = 0, Y_{j} = 0 | Y_{-i,-j} = \mat{0})P_{\mat{\theta}}(Y_{i} = 1, Y_{j} = 1 | Y_{-i,-j} = \mat{0})}{P_{\mat{\theta}}(Y_{i} = 0, Y_{j} = 1 | Y_{-i,-j} = \mat{0})P_{\mat{\theta}}(Y_{i} = 1, Y_{j} = 0 | Y_{-i,-j} = \mat{0})} \\
&= \log\frac{P_{\mat{\theta}}(Y_i = Y_j = 1 | Y_{-i,-j} = \mat{0})}{(1 - P_{\mat{\theta}}(Y_i = Y_j = 1 | Y_{-i,-j} = \mat{0}))}\frac{(1 - P_{\mat{\theta}}(Y_i = 1 | Y_{-i} = \mat{0})P_{\mat{\theta}}(Y_j = 1 | Y_{-j} = \mat{0}))}{P_{\mat{\theta}}(Y_i = 1 | Y_{-i} = \mat{0})P_{\mat{\theta}}(Y_j = 1 | Y_{-j} = \mat{0})}.
\end{align*}
Note that we have expressed all of the natural parameters $\mat{\theta}$ in terms of conditional probabilities: either one component is $1$ and the rest are conditioned to be zero, or two components are $1$ and the rest are conditioned to be zero. These conditional probabilities therefore determine the natural parameters and provide an alternative parameterization. Let us denote the vector of these conditional probabilities by $\mat{\pi}$, using the same index mapping as for $\mat{\theta}$; more precisely, the $q(q + 1) / 2$ dimensional vector $\mat{\pi}$ is defined component-wise as
\begin{align*}
{\pi}_{\iota(i)} = {\pi}(\mat{\theta})_{\iota(i)} &= P_{\mat{\theta}}(Y_i = 1 | Y_{-i} = \mat{0}) = \frac{\exp({\theta}_{\iota(i)})}{1 + \exp({\theta}_{\iota(i)})} \\
{\pi}_{\iota(i, j)} = {\pi}(\mat{\theta})_{\iota(i, j)} &= P_{\mat{\theta}}(Y_i = Y_j = 1 | Y_{-i, -j} = \mat{0}) \\
&= \frac{\exp(\theta_{\iota(i)} + \theta_{\iota(j)} + \theta_{\iota(i, j)})}{1 + \exp(\theta_{\iota(i)}) + \exp(\theta_{\iota(j)}) + \exp(\theta_{\iota(i)} + \theta_{\iota(j)} + \theta_{\iota(i, j)})}
\end{align*}
and the component wise inverse relation is given by
\begin{equation}
\begin{aligned}[c]
\theta_{\iota(i)} = \theta(\mat{\pi})_{\iota(i)}
&= \log\frac{\pi_{\iota(i)}}{1 - \pi_{\iota(i)}} \\
\theta_{\iota(i, j)} = \theta(\mat{\pi})_{\iota(i, j)}
&= \log\frac{(1 - \pi_{\iota(i)}\pi_{\iota(j)})\pi_{\iota(i, j)}}{\pi_{\iota(i)}\pi_{\iota(j)}(1 - \pi_{\iota(i, j)})}.
\end{aligned}\label{eq:ising_theta_from_cond_prob}
\end{equation}
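A small sketch of the component-wise conversion \eqref{eq:ising_theta_from_cond_prob} from the conditional probabilities $\mat{\pi}$ to the natural parameters $\mat{\theta}$; the index mapping \texttt{iota} is assumed as in the previous sketch and is passed in as an argument.
\begin{verbatim}
# Conversion pi -> theta following eq. (ising_theta_from_cond_prob); `iota`
# is the index mapping from the previous sketch.
import numpy as np

def theta_from_pi(pi, q, iota):
    theta = np.empty_like(pi)
    for i in range(1, q + 1):
        p_i = pi[iota(i, i, q) - 1]
        theta[iota(i, i, q) - 1] = np.log(p_i / (1 - p_i))
    for i in range(1, q + 1):
        for j in range(i + 1, q + 1):
            p_i = pi[iota(i, i, q) - 1]
            p_j = pi[iota(j, j, q) - 1]
            p_ij = pi[iota(i, j, q) - 1]
            theta[iota(i, j, q) - 1] = np.log(
                (1 - p_i * p_j) * p_ij / (p_i * p_j * (1 - p_ij)))
    return theta
\end{verbatim}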
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Log-Likelihood, Score and Fisher Information}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The log-likelihood for a given dataset $\mat{y}_i$, $i = 1, ..., n$, is
\begin{displaymath}
l(\mat{\theta})
= \log\prod_{i = 1}^n P_{\mat{\theta}}(Y = \mat{y}_i)
= \log\prod_{i = 1}^n p_0(\mat{\theta})\exp( \t{\vech(\mat{y}_i\t{\mat{y}_i})}\mat{\theta} )
= n\log p_0(\mat{\theta}) + \sum_{i = 1}^n \t{\vech(\mat{y}_i\t{\mat{y}_i})}\mat{\theta}.
\end{displaymath}
To compute the Score we first derive the differential
\begin{align*}
\d l(\mat{\theta})
&= n p_0(\mat{\theta})^{-1} \d p_0(\mat{\theta}) + \sum_{i = 1}^n \t{\vech(\mat{y}_i\t{\mat{y}_i})}\d\mat{\theta} \\
&= -n p_0(\mat{\theta}) \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\t{\vech(\mat{y}\t{\mat{y}})}\d\mat{\theta} + \sum_{i = 1}^n \t{\vech(\mat{y}_i\t{\mat{y}_i})}\d\mat{\theta}
\end{align*}
leading to the Score
\begin{align}
\nabla_{\mat{\theta}} l
&= -n p_0(\mat{\theta}) \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\vech(\mat{y}\t{\mat{y}}) + \sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}) \nonumber \\
&= -n \E_{\mat{\theta}} \vech(Y\t{Y}) + \sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}). \label{eq:ising_score}
\end{align}
The second differential of the log-likelihood is
\begin{multline*}
\d^2 l(\mat{\theta})
= n p_0(\mat{\theta})^2 \d\t{\mat{\theta}} \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\vech(\mat{y}\t{\mat{y}}) \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\t{\vech(\mat{y}\t{\mat{y}})}\d\mat{\theta} \\
- n p_0(\mat{\theta}) \sum_{y\in\{0, 1\}^q} \d\t{\mat{\theta}}\exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\vech(\mat{y}\t{\mat{y}})\t{\vech(\mat{y}\t{\mat{y}})}\d\mat{\theta} + 0
\end{multline*}
leading to the Hessian
\begin{align*}
\nabla^2_{\mat{\theta}} l
&= n (\E_{\mat{\theta}}\vech(Y\t{Y}))\t{(\E_{\mat{\theta}}\vech(Y\t{Y}))} - n \E_{\mat{\theta}}\vech(Y\t{Y})\t{\vech(Y\t{Y})} \\
&= -n \cov_{\mat{\theta}}(\vech(Y\t{Y}), \vech(Y\t{Y})).
\end{align*}
From this the Fisher Information is directly given as
\begin{displaymath}
\mathcal{I}(\mat{\theta}) = -\E_{\mat{\theta}} \nabla^2_{\mat{\theta}} l
= n \cov_{\mat{\theta}}(\vech(Y\t{Y}), \vech(Y\t{Y})).
\end{displaymath}
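The Score \eqref{eq:ising_score} and the Fisher information can be evaluated by enumerating all $2^q$ events, as the following sketch indicates (reusing the \texttt{vech} helper from the earlier sketch; feasible only for small $q$).
\begin{verbatim}
# Score and Fisher information of the Ising model by brute-force enumeration;
# `vech` is the helper from the earlier sketch.
import numpy as np
from itertools import product

def ising_moments(theta, q, vech):
    # E_theta[vech(Y Y')] and Cov_theta(vech(Y Y'))
    ys = [np.array(y, dtype=float) for y in product((0, 1), repeat=q)]
    stats = np.stack([vech(np.outer(y, y)) for y in ys])   # 2^q x q(q+1)/2
    w = np.exp(stats @ theta)
    w /= w.sum()                                            # P_theta(Y = y)
    mean = w @ stats
    cov = stats.T @ (w[:, None] * stats) - np.outer(mean, mean)
    return mean, cov

def score_and_fisher(theta, Y, q, vech):
    # Y is the n x q binary data matrix
    n = Y.shape[0]
    mean, cov = ising_moments(theta, q, vech)
    suff = np.stack([vech(np.outer(y, y)) for y in Y])
    score = suff.sum(axis=0) - n * mean      # eq. (ising_score)
    fisher = n * cov                         # I(theta) = n Cov_theta(vech(Y Y'))
    return score, fisher
\end{verbatim}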
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For the estimation we use the Fisher scoring algorithm, an iterative procedure with updating rule
\begin{displaymath}
\mat{\theta}_{t+1} = \mat{\theta}_{t} + \mathcal{I}(\mat{\theta}_t)^{-1}\nabla_{\mat{\theta}} l(\mat{\theta}_t).
\end{displaymath}
The basic idea behind Fisher scoring is to consider a Taylor expansion of the score
\begin{displaymath}
\nabla_{\mat{\theta}} l(\mat{\theta}^*) \approx \nabla_{\mat{\theta}}l(\mat{\theta}) + \nabla^2_{\mat{\theta}} l(\mat{\theta})(\mat{\theta}^* - \mat{\theta}).
\end{displaymath}
Setting $\mat{\theta}^*$ to the MLE $\widehat{\mat{\theta}}$ and using $\nabla_{\mat{\theta}} l(\widehat{\mat{\theta}}) = 0$ we get
\begin{align*}
0 &\approx \nabla_{\mat{\theta}}l(\mat{\theta}) + \nabla^2_{\mat{\theta}} l(\mat{\theta})(\widehat{\mat{\theta}} - \mat{\theta}) \\
\widehat{\mat{\theta}} &\approx \mat{\theta} - (\nabla^2_{\mat{\theta}} l(\mat{\theta}))^{-1}\nabla_{\mat{\theta}}l(\mat{\theta}).
\end{align*}
Now, replacing the Hessian $\nabla^2_{\mat{\theta}} l(\mat{\theta})$ with its expectation, that is, with $-\mathcal{I}(\mat{\theta}) = \E_{\mat{\theta}} \nabla^2_{\mat{\theta}} l(\mat{\theta})$, leads to
\begin{displaymath}
\widehat{\mat{\theta}} \approx \mat{\theta} + \mathcal{I}(\mat{\theta})^{-1}\nabla_{\mat{\theta}}l(\mat{\theta})
\end{displaymath}
which is precisely the above updating rule.
For an initial estimate $\mat{\theta}_0$ we can set the Score \eqref{eq:ising_score} to zero, which at the MLE $\widehat{\mat{\theta}}$ gives
\begin{displaymath}
\E_{\widehat{\mat{\theta}}} \vech(Y\t{Y}) = \frac{1}{n}\sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}).
\end{displaymath}
Since $\E_{\widehat{\mat{\theta}}} \vech(Y\t{Y})_{\iota(i, j)} = P_{\widehat{\mat{\theta}}}(Y_iY_j = 1)$, we may treat the marginal probabilities as not being too far from the conditional probabilities and set $\mat{\pi}_0 = \E_{\widehat{\mat{\theta}}} \vech(Y\t{Y})$, from which we can compute $\mat{\theta}_0 = \mat{\theta}(\mat{\pi}_0)$ as in \eqref{eq:ising_theta_from_cond_prob}.
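Putting the pieces together, a minimal Fisher scoring sketch could look as follows; the helpers \texttt{vech}, \texttt{iota}, \texttt{theta\_from\_pi} and \texttt{score\_and\_fisher} are assumed from the previous sketches, and the clipping of $\mat{\pi}_0$ is only a numerical safeguard.
\begin{verbatim}
# Fisher scoring for the Ising model; helpers are assumed from the previous
# sketches. The initial value follows the heuristic described above.
import numpy as np

def fit_ising(Y, q, vech, iota, theta_from_pi, score_and_fisher,
              max_iter=50, tol=1e-8):
    # initial value: averaged vech(y y') treated as conditional probabilities
    pi0 = np.mean(np.stack([vech(np.outer(y, y)) for y in Y]), axis=0)
    pi0 = np.clip(pi0, 1e-6, 1 - 1e-6)       # avoid log(0) in the conversion
    theta = theta_from_pi(pi0, q, iota)
    for _ in range(max_iter):
        score, fisher = score_and_fisher(theta, Y, q, vech)
        step = np.linalg.solve(fisher, score)
        theta = theta + step
        if np.linalg.norm(step) < tol:
            break
    return theta
\end{verbatim}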
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{The Ising Model with Covariates}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Now assume additional covariates $X\in\mathbb{R}^p$ are given alongside the Multivariate Bernoulli variable $Y\in\{0, 1\}^q$. Their relation is specified by assuming that $Y$ follows the Ising model given $X$,
\begin{align*}
P_{\mat{\theta}_X}(Y = \mat{y} | X)
&= p_0(\mat{\theta}_X)\exp(\t{\vech(\mat{y} \t{\mat{y}})}\mat{\theta}_X) \\
&= p_0(\mat{\theta}_X)\exp(\t{\mat{y}}\mat{\Theta}_X\mat{y}).
\end{align*}
where $\mat{\Theta}$ denotes the symmetric $q\times q$ matrix satisfying $\t{\mat{y}}\mat{\Theta}\mat{y} = \t{\vech(\mat{y} \t{\mat{y}})}\mat{\theta}$ for every $\mat{y}$. The explicit relation is
\begin{align*}
\mat{\Theta} &= \tfrac{1}{2}(\mat{1}_q\t{\mat{1}_q} + \mat{I}_q)\odot\vech^{-1}(\mat{\theta}), \\
\mat{\theta} &= \vech((2\mat{1}_q\t{\mat{1}_q} - \mat{I}_q)\odot\mat{\Theta}).
\end{align*}
Assuming $X$ is centered, $\E X = 0$ (w.l.o.g.\ since we can always replace $X$ with $X - \E X$), we relate the covariate dependent parameters $\mat{\Theta}_X$ to $X$ by
\begin{displaymath}
\mat{\Theta}_X = \t{\mat{\alpha}}X\t{X}\mat{\alpha}
\end{displaymath}
for an unconstrained parameter matrix $\mat{\alpha}$ of dimension $p\times q$. This leads to the Ising model with covariates of the form
\begin{displaymath}
P_{\mat{\alpha}}(Y = \mat{y} | X)
= p_0(\mat{\alpha}, X)\exp(\t{\mat{y}}\t{\mat{\alpha}}X\t{X}\mat{\alpha}\mat{y}).
\end{displaymath}
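The relation between $\mat{\alpha}$, a covariate value $\mat{x}$, and the induced natural parameters can be sketched as follows (\texttt{vech} as before; $\mat{\alpha}$ and $\mat{x}$ are hypothetical inputs).
\begin{verbatim}
# Covariate-dependent parameters: Theta_x = alpha' x x' alpha and the
# corresponding vech-parameter theta_x = vech((2 11' - I) .* Theta_x).
import numpy as np

def Theta_x(alpha, x):
    # alpha: p x q, x: p-dimensional covariate vector
    ax = alpha.T @ x
    return np.outer(ax, ax)                   # alpha' x x' alpha

def theta_from_Theta(Theta, vech):
    q = Theta.shape[0]
    mask = 2.0 * np.ones((q, q)) - np.eye(q)  # 2 off-diagonal, 1 on diagonal
    return vech(mask * Theta)
\end{verbatim}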
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Log-likelihood, Score and Fisher information}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Given a single observation $(\mat{y}, \mat{x})$, the log-likelihood is
\begin{displaymath}
l(\mat{\alpha}) = \log P_{\mat{\alpha}}(Y = \mat{y} \mid X = \mat{x})
= \log p_0(\mat{\alpha}, \mat{x}) + \t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}.
\end{displaymath}
Before we write the differential of the log-likelihood we take a look at the following
\begin{align*}
\d(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})
&= \d\tr(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}) \\
&= 2 \tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \\
&= 2 \t{\vec(\mat{\alpha})}(\mat{y}\t{\mat{y}}\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha}) \\
\d\log p_0(\mat{\alpha}, \mat{x})
&= -p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\d\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}) \\
&= -2p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \\
&= -2 \tr(\E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}]\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \\
&= -2 \t{\vec(\mat{\alpha})}(\E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}]\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha})
\end{align*}
Therefore, the differential of the log-likelihood is
\begin{displaymath}
\d l(\mat{\alpha})
= 2 \tr((\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}])\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha})
\end{displaymath}
or equivalently the Score
\begin{align}
\nabla_{\mat{\alpha}}l
&= 2 \mat{x}\t{\mat{x}}\mat{\alpha}(\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}]) \nonumber \\
&= 2 \mat{x}\t{\mat{x}}\mat{\alpha}\vec^{-1}(\mat{D}_q(\vech(\mat{y}\t{\mat{y}}) - \E_{\mat{\theta}_{\mat{\alpha}}(\mat{x})}[\vech(Y\t{Y})\mid X=\mat{x}])) \label{eq:ising_cond_score} \\
&= 2 \mat{x}\t{\mat{x}}\mat{\alpha}\vec^{-1}(\mat{D}_q\nabla_{\mat{\theta}}l(\mat{\theta}_{\mat{\alpha}}(\mat{x}); \mat{y})) \nonumber
\end{align}
where $\nabla_{\mat{\theta}}l(\mat{\theta}_{\mat{\alpha}}(\mat{x}); \mat{y})$ is the Score \eqref{eq:ising_score} for a single observation and $\mat{D}_q$ is the duplication matrix.
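A sketch of the per-observation Score \eqref{eq:ising_cond_score} is given below, with the conditional expectation $\E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}]$ computed by enumeration (again only feasible for small $q$).
\begin{verbatim}
# Per-observation score of the Ising model with covariates; the conditional
# expectation E[Y Y' | X = x] is computed by enumerating all 2^q events.
import numpy as np
from itertools import product

def cond_E_YY(alpha, x, q):
    # E_alpha[Y Y' | X = x] with P(Y = y | X = x) ~ exp(y' alpha' x x' alpha y)
    Theta = np.outer(alpha.T @ x, alpha.T @ x)
    ys = [np.array(y, dtype=float) for y in product((0, 1), repeat=q)]
    w = np.array([np.exp(y @ Theta @ y) for y in ys])
    w /= w.sum()
    return sum(wi * np.outer(y, y) for wi, y in zip(w, ys))

def score_alpha(alpha, x, y, q):
    # 2 x x' alpha (y y' - E[Y Y' | X = x])
    return 2.0 * np.outer(x, x) @ alpha @ (np.outer(y, y) - cond_E_YY(alpha, x, q))
\end{verbatim}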
Now we continue with the second-order differential of the log-likelihood; to this end, consider
\begin{align*}
\d^2(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})
&= 2 \tr(\mat{y}\t{\mat{y}}\t{(\d\mat{\alpha})}\mat{x}\t{\mat{x}}\d\mat{\alpha}) + 2\tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d^2\mat{\alpha}) \\
&= 2\t{\vec(\d\mat{\alpha})}(\mat{y}\t{\mat{y}}\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha}) + 0.
\end{align*}
The next term is
\begin{align*}
\d^2 \log p_0(\mat{\alpha}, \mat{x})
&= \d\Big( -2p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \Big) \\
\intertext{To shorten the expressions let $A_{\mat{y}} = (\mat{y}\t{\mat{y}}\otimes \mat{x}\t{\mat{x}})\vec{\mat{\alpha}}$, then}
\ldots &= \d\Big( -2p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{A_{\mat{y}}}\vec(\d\mat{\alpha}) \Big) \\
&= -2(\d p_0(\mat{\alpha}, \mat{x}))\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{A_{\mat{y}}}\vec(\d\mat{\alpha}) \\
&\qquad -4 p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{\vec(\d\mat{\alpha})}A_{\mat{y}}\t{A_{\mat{y}}}\vec(\d\mat{\alpha}) \\
&\qquad\qquad -2 p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{\vec(\d\mat{\alpha})}(\mat{y}\t{\mat{y}}\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha}) \\
&= 4\t{\vec(\d\mat{\alpha})} \E_{\mat{\alpha}}[A_{Y} \mid X = \mat{x}]\E_{\mat{\alpha}}[\t{A_{Y}} \mid X = \mat{x}] \vec(\d\mat{\alpha}) \\
&\qquad -4\t{\vec(\d\mat{\alpha})} \E_{\mat{\alpha}}[A_{Y}\t{A_{Y}} \mid X = \mat{x}] \vec(\d\mat{\alpha}) \\
&\qquad\qquad -2\t{\vec(\d\mat{\alpha})} \E_{\mat{\alpha}}[Y\t{Y}\otimes \mat{x}\t{\mat{x}} \mid X = \mat{x}] \vec(\d\mat{\alpha}) \\
&= -\t{\vec(\d\mat{\alpha})}( 4\cov_{\mat{\alpha}}(A_{Y} \mid X = \mat{x})
+ 2 \E_{\mat{\alpha}}[Y\t{Y}\otimes \mat{x}\t{\mat{x}} \mid X = \mat{x}]) \vec(\d\mat{\alpha})
\end{align*}
Back substituting to get the second order differential of the log-likelihood yields
\begin{displaymath}
\d^2 l(\mat{\alpha}) = \t{\vec(\d\mat{\alpha})}(
2(\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y} \mid X = \mat{x}])\otimes \mat{x}\t{\mat{x}}
- 4\cov_{\mat{\alpha}}(A_{Y} \mid X = \mat{x})
) \vec(\d\mat{\alpha})
\end{displaymath}
leading to the Hessian of the log-likelihood
\begin{displaymath}
\nabla^2_{\vec{\mat{\alpha}}} l
= 2(\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y} \mid X = \mat{x}])\otimes \mat{x}\t{\mat{x}}
- 4\cov_{\mat{\alpha}}((Y\t{Y}\otimes \mat{x}\t{\mat{x}})\vec{\mat{\alpha}} \mid X = \mat{x}).
\end{displaymath}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
{\color{red}How to do this in a good way still needs to be figured out.}
For the initial value we start by equating the Score \eqref{eq:ising_cond_score} to zero, giving
\begin{displaymath}
\widehat{\E}_{\mat{\theta}_{\mat{\alpha}}(\mat{x})}[\vech(Y\t{Y})\mid X=\mat{x}] = \frac{1}{n}\sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i})
\end{displaymath}
as an estimate of the marginal probabilities of the single and two-way interaction effects $\E_{\mat{\theta}_{\mat{\alpha}}(\mat{x})}[\vech(Y\t{Y})\mid X=\mat{x}]$. Assuming that the marginal probabilities are similar to the conditional probabilities $\mat{\pi}$, we simply equate them to get an initial estimate of the conditional probabilities
\begin{displaymath}
\widehat{\mat{\pi}}_0 = \frac{1}{n}\sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}).
\end{displaymath}
Using the relation \eqref{eq:ising_theta_from_cond_prob} we compute an initial estimate for the natural parameters
\begin{displaymath}
\widehat{\mat{\theta}}_0 = \widehat{\mat{\theta}}(\widehat{\mat{\pi}}_0)
\end{displaymath}
and convert it to the matrix version of the parameters
\begin{displaymath}
\widehat{\mat{\Theta}}_0 = \tfrac{1}{2}(\mat{1}_q\t{\mat{1}_q} + \mat{I}_q)\odot \vech^{-1}(\widehat{\mat{\theta}}_0).
\end{displaymath}
Let $\widehat{\mat{\Sigma}}_0 = \frac{1}{n}\sum_{i = 1}^n \mat{x}_i\t{\mat{x}_i}$; we then seek an initial $\widehat{\mat{\alpha}}_0$ satisfying
\begin{displaymath}
\widehat{\mat{\Theta}}_0 = \t{\widehat{\mat{\alpha}}_0}\widehat{\mat{\Sigma}}_0\widehat{\mat{\alpha}}_0.
\end{displaymath}
Next we define $m = \min(p, q)$ and take a rank $m$ approximation of both $\widehat{\mat{\Theta}}_0$ and $\widehat{\mat{\Sigma}}_0$ via an SVD. These approximations have the form
\begin{align*}
\widehat{\mat{\Theta}}_0 &\approx \mat{U}_{\widehat{\mat{\Theta}}_0} \mat{D}_{\widehat{\mat{\Theta}}_0} \t{\mat{U}_{\widehat{\mat{\Theta}}_0}} \\
\widehat{\mat{\Sigma}}_0 &\approx \mat{U}_{\widehat{\mat{\Sigma}}_0} \mat{D}_{\widehat{\mat{\Sigma}}_0} \t{\mat{U}_{\widehat{\mat{\Sigma}}_0}}
\end{align*}
where $\mat{U}_{\widehat{\mat{\Theta}}_0}$, $\mat{U}_{\widehat{\mat{\Sigma}}_0}$ are semi-orthogonal matrices of dimensions $q\times m$, $p\times m$, respectively, and both diagonal matrices $\mat{D}_{\widehat{\mat{\Theta}}_0}$, $\mat{D}_{\widehat{\mat{\Sigma}}_0}$ have dimensions $m\times m$. Substituting the approximations into the above relation between $\widehat{\mat{\Theta}}_0$ and $\widehat{\mat{\alpha}}_0$ yields
\begin{displaymath}
\mat{U}_{\widehat{\mat{\Theta}}_0} \mat{D}_{\widehat{\mat{\Theta}}_0} \t{\mat{U}_{\widehat{\mat{\Theta}}_0}} \approx \t{\widehat{\mat{\alpha}}_0}\mat{U}_{\widehat{\mat{\Sigma}}_0} \mat{D}_{\widehat{\mat{\Sigma}}_0} \t{\mat{U}_{\widehat{\mat{\Sigma}}_0}}\widehat{\mat{\alpha}}_0.
\end{displaymath}
Solving for $\widehat{\mat{\alpha}}_0$ leads to our initial value estimate
\begin{displaymath}
\widehat{\mat{\alpha}}_0 = \mat{U}_{\widehat{\mat{\Sigma}}_0} \mat{D}_{\widehat{\mat{\Sigma}}_0}^{-1/2}\mat{D}_{\widehat{\mat{\Theta}}_0}^{1/2} \t{\mat{U}_{\widehat{\mat{\Theta}}_0}}.
\end{displaymath}
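A sketch of this initial value computation, assuming $\widehat{\mat{\Theta}}_0$ and $\widehat{\mat{\Sigma}}_0$ are already given as above:
\begin{verbatim}
# Initial value alpha_0 from rank-m approximations of Theta_0 (q x q) and
# Sigma_0 (p x p) via SVDs, following the formula above.
import numpy as np

def initial_alpha(Theta0, Sigma0):
    q, p = Theta0.shape[0], Sigma0.shape[0]
    m = min(p, q)
    U_T, d_T, _ = np.linalg.svd(Theta0)      # rank-m approximation pieces
    U_S, d_S, _ = np.linalg.svd(Sigma0)
    U_T, d_T = U_T[:, :m], d_T[:m]
    U_S, d_S = U_S[:, :m], d_S[:m]
    # alpha_0 = U_Sigma D_Sigma^{-1/2} D_Theta^{1/2} U_Theta'
    return U_S @ np.diag(d_T ** 0.5 / d_S ** 0.5) @ U_T.T
\end{verbatim}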
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
{\color{red}Some notes}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The conditional Fisher information $\mathcal{I}_{Y\mid X = \mat{x}}$ is
\begin{displaymath}
\mathcal{I}_{Y\mid X = \mat{x}}(\vec\mat{\alpha})
= -\E_{\mat{\alpha}}[\nabla^2_{\vec{\mat{\alpha}}} l \mid X = \mat{x}]
= 4\cov_{\mat{\alpha}}((Y\t{Y}\otimes \mat{x}\t{\mat{x}})\vec{\mat{\alpha}} \mid X = \mat{x}).
\end{displaymath}
If we know that $X\sim f_X$ with pdf (or pmf) $f_X$, we get
\begin{displaymath}
\mathcal{I}_{Y|X}(\vec\mat{\alpha}) = \int\mathcal{I}_{Y\mid X = \mat{x}}(\vec\mat{\alpha})f_X(\mat{x})\,\d\mat{x}
= 4\E_{X}\cov_{\mat{\alpha}}((Y\t{Y}\otimes X\t{X})\vec{\mat{\alpha}} \mid X)
\end{displaymath}
Moreover, from
\begin{displaymath}
f_{\theta}(Y, X) = f_{\theta}(Y\mid X)f_{\theta}(X)
\end{displaymath}
it follows that
\begin{displaymath}
l_{X, Y}(\theta) = \log f_{\theta}(Y, X)
= \log f_{\theta}(Y\mid X) + \log f_{\theta}(X)
= l_{Y\mid X}(\theta) + l_{X}(\theta)
\end{displaymath}
% With the fisher information in general beeing defined (under any distribution) $\mathcal{I}(\theta) = \E_{\theta} \nabla l(\theta)$ with the classical Fisher information under the joint distribution of $X, Y$ to be
% \begin{align*}
% \mathcal{I}_{X, Y}(\theta)
% &= \E_{X, Y}\nabla l_{X, Y}(\theta) \\
% &= \E_{X, Y}\nabla l_{Y\mid X}(\theta) + \E_{X, Y}\nabla l_{X}(\theta) \\
% &= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] + \E_{X, Y}\nabla l_{X}(\theta) \\
% &= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] + \E_{X}\E_{Y\mid X}[\nabla l_{X}(\theta)\mid X] \\
% &= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] + \E_{X}\nabla l_{X}(\theta) \\
% &= \mathcal{I}_{Y\mid X}(\theta) + \mathcal{I}_{X}(\theta)
% \end{align*}
% What happens if the know the conditional distribution $Y\mid X\sim f_{\theta}$ only, meaning we not know the distribution of $X$. Then the log-likelihood for a data set $(y_i, x_i)$ with $i = 1, ..., n$ observations has the form
% \begin{displaymath}
% l_{Y\mid X}(\theta) = \log\prod_{i = 1}^n f_{\theta}(Y = y_i \mid X = x_i)
% = \sum_{i = 1}^n \log f_{\theta}(Y = y_i \mid X = x_i)
% \end{displaymath}
% leading to
% \begin{displaymath}
% \nabla l_{Y\mid X}(\theta) = \sum_{i = 1}^n \nabla \log f_{\theta}(Y = y_i \mid X = x_i)
% \end{displaymath}
\appendix
\section{Notes on Fisher Information}
Let $X\sim f_{\theta}$ be a random variable following a parameterized pdf (or pmf) $f_{\theta}$ with parameter vector $\theta$. Its log-likelihood (at the population level, where it is itself a random variable) is then
\begin{displaymath}
l(\theta) = \log f_{\theta}(X)
\end{displaymath}
and the Score is defined as the derivative of the log-likelihood
\begin{displaymath}
\nabla l(\theta) = \nabla\log f_{\theta}(X).
\end{displaymath}
The expectation of the Score is
\begin{displaymath}
\E \nabla l(\theta)
= \int f_{\theta}(x)\nabla\log f_{\theta}(x)\,\d x
= \int \nabla f_{\theta}(x)\,\d x
= \nabla \int f_{\theta}(x)\,\d x
= \nabla 1
= 0.
\end{displaymath}
The Fisher information is defined as follows; it is identical to the covariance of the Score due to the Score's zero expectation:
\begin{displaymath}
\mathcal{I}(\theta) = \E \nabla l(\theta)\t{\nabla l(\theta)}.
\end{displaymath}
Now assume we have two random variables $X, Y$ and a parameter vector $\theta$; then the joint distribution relates to the conditional and the marginal distributions by
\begin{displaymath}
f_{\theta}(X, Y) = f_{\theta}(Y\mid X)f_{\theta}(X)
\end{displaymath}
leading to the log-likelihood
\begin{displaymath}
l_{X, Y}(\theta) = \log f_{\theta}(X, Y) = \log f_{\theta}(Y\mid X) + \log f_{\theta}(X)
= l_{Y\mid X}(\theta) + l_{X}(\theta).
\end{displaymath}
The Score decomposes in the same way due to the linearity of differentiation
\begin{displaymath}
\nabla l_{X, Y}(\theta)
= \nabla l_{Y\mid X}(\theta) + \nabla l_{X}(\theta)
\end{displaymath}
and for the Fisher Information the same additive decomposition holds, although due to a different argument, because
\begin{align*}
\mathcal{I}_{X, Y}(\theta)
&= \E_{X, Y}\nabla l_{X, Y}(\theta)\t{\nabla l_{X, Y}(\theta)} \\
&= \E_{X, Y}(\nabla l_{Y\mid X}(\theta) + \nabla l_{X}(\theta))\t{(\nabla l_{Y\mid X}(\theta) + \nabla l_{X}(\theta))} \\
&= \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)}
+ \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{X}(\theta)} \\
&\qquad + \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{Y\mid X}(\theta)}
+ \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)} \\
&= \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)}
+ \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)}
\end{align*}
where the last equality is due to
\begin{displaymath}
\E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{X}(\theta)}
= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X]\t{\nabla l_{X}(\theta)}
= 0
\end{displaymath}
using $\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] = 0$, the expectation of the (conditional) Score. The second cross term vanishes by the same argument, and therefore we get
\begin{align*}
\mathcal{I}_{X, Y}(\theta)
&= \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)}
+ \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)} \\
&= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)} \mid X]
+ \E_{X}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)} \\
&= \mathcal{I}_{Y\mid X}(\theta) + \mathcal{I}_{X}(\theta).
\end{align*}
Note that the conditional Fisher Information has the form
\begin{displaymath}
\mathcal{I}_{Y\mid X}(\theta)
= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)} \mid X]
= \int \mathcal{I}_{Y\mid X = x}(\theta)f_X(x)\,\d x
\end{displaymath}
Furthermore, in the case that the distribution of $X$ does not depend on $\theta$, meaning $f_{\theta}(X) = f(X)$, then $\mathcal{I}_X(\theta) = 0$ and $\mathcal{I}_{X, Y}(\theta) = \mathcal{I}_{Y \mid X}(\theta)$.
\end{document}