The \emph{mixed-product} property states that for matrices $A, B, C, D$ such that the products $A C$ and $B D$ are well defined,
\begin{displaymath}
(A\otimes B)(C \otimes D) = (A C) \otimes (B D).
\end{displaymath}
In combination with the \emph{Hadamard product} (element-wise multiplication), for matrices $A, C$ of the same size and $B, D$ of the same size, it holds that
\begin{displaymath}
(A\otimes B)\circ (C \otimes D) = (A \circ C) \otimes (B \circ D).
\end{displaymath}
The \emph{transpose} of the Kronecker product fulfills
\begin{displaymath}
(A\otimes B)^T = A^T \otimes B^T.
\end{displaymath}
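The following NumPy sketch numerically checks the three identities above on randomly generated matrices; the dimensions are arbitrary and chosen only so that the required products are well defined.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
A, C = rng.normal(size=(3, 4)), rng.normal(size=(4, 5))   # A C is well defined
B, D = rng.normal(size=(2, 6)), rng.normal(size=(6, 3))   # B D is well defined

# mixed-product property: (A (x) B)(C (x) D) = (A C) (x) (B D)
assert np.allclose(np.kron(A, B) @ np.kron(C, D), np.kron(A @ C, B @ D))

# compatibility with the Hadamard product (A, C and B, D of equal size)
A2, C2 = rng.normal(size=(3, 4)), rng.normal(size=(3, 4))
B2, D2 = rng.normal(size=(2, 5)), rng.normal(size=(2, 5))
assert np.allclose(np.kron(A2, B2) * np.kron(C2, D2), np.kron(A2 * C2, B2 * D2))

# transpose: (A (x) B)^T = A^T (x) B^T
assert np.allclose(np.kron(A, B).T, np.kron(A.T, B.T))
\end{verbatim}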
\section{Distance Computation}
The pair-wise distances $d_V(X_{i,:}, X_{j,:})$ are arranged in the distance matrix $D\in\mathbb{R}^{n\times n}$ with entries $D_{i,j} = d_V(X_{i,:}, X_{j,:})$.
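The explicit form of $d_V$ is not repeated here. As an illustration, a minimal NumPy sketch assuming the projection-type distance $d_V(x, y) = \lVert x - y\rVert^2 - \lVert V^T(x - y)\rVert^2$ (an assumption, chosen because it is consistent with the gradient $\nabla_V d_V$ used in the next section) computes $D$ as follows.
\begin{verbatim}
import numpy as np

def distance_matrix(X, V):
    """Pairwise distances D[i, j] = d_V(X[i, :], X[j, :]) (sketch).

    Assumes d_V(x, y) = ||x - y||^2 - ||V^T (x - y)||^2, i.e. the squared
    distance of x - y from its projection onto the span of V."""
    diff = X[:, None, :] - X[None, :, :]      # (n, n, p) pairwise differences
    full = np.sum(diff ** 2, axis=-1)         # ||x_i - x_j||^2
    proj = np.sum((diff @ V) ** 2, axis=-1)   # ||V^T (x_i - x_j)||^2
    return full - proj                        # (n, n) with zero main diagonal
\end{verbatim}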
Next, the weighted moments $\bar{y}^{(m)}$ and the ``element-wise'' loss $l_i = L_n(V, X_i)$ are computed as
\begin{displaymath}
\bar{y}^{(m)} = W^T Y^m,\qquad l = \bar{y}^{(2)} - (\bar{y}^{(1)})^2
\end{displaymath}
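In code both moments are a single matrix product each. The construction of the weight matrix $W$ is not restated here; the sketch below forms it from Gaussian kernel weights $w_{ij} \propto \exp(-d_{ij}/(2h^2))$ with columns normalized to sum to one, which is purely an illustrative assumption, and reads $Y^m$ as the element-wise $m$-th power.
\begin{verbatim}
import numpy as np

def moments_and_loss(D, Y, h):
    """Weighted moments ybar^(m) = W^T Y^m and element-wise loss l (sketch)."""
    W = np.exp(-D / (2.0 * h ** 2))       # illustrative kernel weights (assumption)
    W /= W.sum(axis=0, keepdims=True)     # columns sum to one (assumption)
    ybar1 = W.T @ Y                       # ybar^(1) = W^T Y
    ybar2 = W.T @ Y ** 2                  # ybar^(2) = W^T Y^2 (element-wise square)
    l = ybar2 - ybar1 ** 2                # l = ybar^(2) - (ybar^(1))^2
    return ybar1, ybar2, l
\end{verbatim}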
\section{Gradient Computation}
The model under consideration is
\begin{displaymath}
Y = g(B^T X) + \epsilon.
\end{displaymath}
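For concreteness, a small simulation sketch of data from this model; the link $g$, the matrix $B$ and the noise level are illustrative choices, not prescribed by the text.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(1)
n, p = 200, 5                                        # sample size, number of predictors
B = np.array([[1.0], [0.5], [0.0], [0.0], [0.0]])    # illustrative p x 1 reduction
X = rng.normal(size=(n, p))                          # rows X_i are the samples
eps = 0.1 * rng.normal(size=n)                       # additive noise (illustrative scale)
Y = np.cos(X @ B).ravel() + eps                      # Y = g(B^T X) + eps, g = cos (illustrative)
\end{verbatim}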
Assume a data set $(X_i, Y_i)$ for $i = 1, \ldots, n$ with $X$ an $n\times p$ matrix such that each row represents one sample. Now let $l_i = L_n(V, X_i)$ and $\bar{y}^{(1)}_j = (W^T Y)_j$, and let $d_{i j}, w_{i j}$ denote the entries of the distance and weight matrices. Then the gradient of the ``simple'' CVE method is given as
This representation is cumbersome, and a direct implementation has an asymptotic run-time of $\Theta(n^2p^2)$: the double sum over the samples is quadratic in $n$, and each term involves the $p\times p$ outer product appearing in $\nabla_V d_V$.
This can be optimized and written in matrix notation. First, the distance gradient is given as
\begin{displaymath}
\nabla_V d_V(X_{i,:}, X_{j,:}) = -2(X_{i,:} - X_{j,:})^T(X_{i,:} - X_{j,:}) V.
\end{displaymath}
The relation $\vec(A)_k = a_{i,j}$ holds for $k = nj + i$ with $0\leq k < nm$, $0\leq i < n$ and $0\leq j < m$. This operation is a bijection. When going ``backwards'' the dimension of the original space is required; therefore let $\devec_n$ be the operation such that $\devec_n(\vec(A)) = A$ for $A\in\mathbb{R}^{n\times m}$.\footnote{Note that for $B\in\mathbb{R}^{p\times q}$ with $pq = nm$ we have $\devec_n(\vec(B))\in\mathbb{R}^{n\times m}$.}
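A direct NumPy sketch of $\vec$ and $\devec_n$ using zero-based, column-major indexing as above:
\begin{verbatim}
import numpy as np

def vec(A):
    """Column-major vectorization: vec(A)[n*j + i] = A[i, j] for A with n rows."""
    return np.asarray(A).reshape(-1, order="F")

def devec(v, n):
    """Inverse of vec given the row count n, i.e. devec_n(vec(A)) = A."""
    return np.asarray(v).reshape(n, -1, order="F")
\end{verbatim}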
For a symmetric matrix $A = A^T\in\mathbb{R}^{n\times n}$ the information $a_{i,j} = a_{j,i}$ is stored twice. To remove this redundancy the \emph{symmetric vectorization} is defined, mapping a symmetric $n\times n$ matrix to $\mathbb{R}^{n(n+1)/2}$ by saving the main diagonal and the lower triangular part column by column according to the scheme
\begin{displaymath}
\svec(A)_k = a_{i,j}\quad\text{for}\quad k = n j + i - \frac{j(j + 1)}{2},\ 0\leq j \leq i < n.
\end{displaymath}
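A sketch of the symmetric vectorization (and its inverse) under this column-major packing:
\begin{verbatim}
import numpy as np

def svec(A):
    """Pack the lower triangle (including the diagonal) of a symmetric A column
    by column: svec(A)[n*j + i - j*(j+1)//2] = A[i, j] for 0 <= j <= i < n."""
    n = A.shape[0]
    return np.concatenate([A[j:, j] for j in range(n)])

def unsvec(v, n):
    """Rebuild the full symmetric matrix from its packed lower triangle."""
    A = np.zeros((n, n))
    k = 0
    for j in range(n):
        A[j:, j] = v[k:k + n - j]
        k += n - j
    return A + np.tril(A, -1).T           # mirror the strictly lower part upwards
\end{verbatim}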
The matrix $S$ is \underline{not} symmetric, but we can consider the symmetrization $S + S^T$, which has a zero main diagonal because $D$ has a zero main diagonal, meaning $s_{i i} = 0$ since $d_{i i} = 0$ for each $i$. Therefore the following holds, due to the fact that $\nabla_V d_V(X_{i,:}, X_{j,:}) = \nabla_V d_V(X_{j,:}, X_{i,:})$.
Note the summation indices $0\leq j \leq i < n$. Substituting $\nabla_V d_V(X_{i,:}, X_{j,:}) = -2(X_{i,:}- X_{j,:})^T(X_{i,:}- X_{j,:}) V$ yields
Let $X_{-}$ be the matrix containing the pairwise differences $X_{i,:} - X_{j,:}$ as rows, using the same row indexing scheme as the symmetric vectorization,
\begin{displaymath}
(X_{-})_{k,:} = X_{i,:} - X_{j,:}\quad\text{for}\quad k = n j + i - \frac{j(j + 1)}{2},\ 0\leq j \leq i < n.
\end{displaymath}
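A sketch constructing $X_{-}$ with exactly this packing (note that the rows with $i = j$ are zero rows, mirroring the zero main diagonal kept by $\svec$):
\begin{verbatim}
import numpy as np

def pairwise_differences(X):
    """Rows (X_-)[k, :] = X[i, :] - X[j, :] for 0 <= j <= i < n, packed with
    k = n*j + i - j*(j+1)//2, i.e. the same ordering as svec."""
    n = X.shape[0]
    return np.concatenate([X[j:, :] - X[j, :] for j in range(n)], axis=0)
\end{verbatim}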
With the $X_{-}$ matrix the above double sum can be written in matrix notation as follows\footnote{This is only valid because $s_{i i} = 0$.}
\begin{displaymath}
\nabla_V L_n(V) = -\frac{2}{nh^2} X_{-}^T(\svec(\sym(S)) \circ_r X_{-}) V
\end{displaymath}
where $\circ_r$ denotes the ``recycled'' Hadamard product: for a vector $x\in\mathbb{R}^n$ and a matrix $M\in\mathbb{R}^{n\times m}$ this is simply the element-wise multiplication of each column of $M$ with $x$, or equivalently $x\circ_r M = \underbrace{(x, x, \ldots, x)}_{n\times m}\circ M$ where $\circ$ is the element-wise product.
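Putting the pieces together, the matrix form of the gradient needs no explicit double loop. The sketch below reads $\sym(S)$ as $S + S^T$ (as in the symmetrization above), takes $S$ as given since its definition is not restated here, and implements $\circ_r$ as NumPy broadcasting.
\begin{verbatim}
import numpy as np

def cve_gradient(X_minus, S, V, h):
    """Sketch of nabla_V L_n(V) = -2/(n h^2) X_-^T (svec(S + S^T) circ_r X_-) V."""
    n = S.shape[0]
    T = S + S.T                                        # sym(S), zero main diagonal
    s = np.concatenate([T[j:, j] for j in range(n)])   # svec(sym(S)), length n(n+1)/2
    # circ_r: multiply every column of X_- element-wise with s (broadcasting)
    return -2.0 / (n * h ** 2) * X_minus.T @ (s[:, None] * X_minus) @ V
\end{verbatim}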