From e2034c9ff605e29299c91082a6c440a0fd87d811 Mon Sep 17 00:00:00 2001 From: daniel Date: Tue, 17 Dec 2019 12:08:45 +0100 Subject: [PATCH] rm: notes --- LaTeX/notes.tex | 372 ------------------------------------------------ 1 file changed, 372 deletions(-) delete mode 100644 LaTeX/notes.tex diff --git a/LaTeX/notes.tex b/LaTeX/notes.tex deleted file mode 100644 index 6b4dac9..0000000 --- a/LaTeX/notes.tex +++ /dev/null @@ -1,372 +0,0 @@ -\documentclass[12pt,a4paper]{article} - -\usepackage[utf8]{inputenc} -\usepackage[T1]{fontenc} -\usepackage{amsmath, amsfonts, amssymb, amsthm} -\usepackage{tikz} -\usepackage{listings} -\usepackage{fullpage} - - -\lstdefinelanguage{PseudoCode} { - morekeywords={ - for, - while, - repeat, - from, - each, - foreach, - break, - continue, - in, - do, - as, - and, - or, - end, - return, - if, - then, - else, - function, - begin, - to, - new, - input, - output - }, - morecomment=[l]{/*}, - morecomment=[l]{//}, - % basicstyle=\ttfamily, - % keywordstyle=\color{blue}, %\ttfamily, - commentstyle=\color{gray}\it, - keywordstyle=\bf, - rulecolor=\color{black}, - literate=% - {!=}{{$\neq$}}1 - {<=}{{$\leq$}}1 - {>=}{{$\geq$}}1 - {->}{{$\rightarrow$}}1 - {<-}{{$\leftarrow$}}1 -} - -% }, -% tabsize=3, -% sensitive=false, -% morecomment=[l]{#}, -% morestring=[b]", -% extendedchars=true, -% inputencoding=utf8, -% literate=% -% {!=}{{$\neq$}}1 -% {<=}{{$\leq$}}1 -% {>=}{{$\geq$}}1 -% {<>}{{$\neq$}}1 -% {:=}{{$\ \leftarrow\quad$}}1 -% {Ö}{{\"O}}1 -% {Ä}{{\"A}}1 -% {Ü}{{\"U}}1 -% {ß}{{\ss{}}}1 -% {ü}{{\"u}}1 -% {ä}{{\"a}}1 -% {ö}{{\"o}}1 -% {~}{{\textasciitilde}}1, -% texcl=true % use all chars from \usepackage[utf8]{inputenc} -% } -\lstset{ - tabsize=4, - xleftmargin=0pt, % left margin - numbers=left, % linenumber position - numbersep=15pt, % left linenumber padding - numberstyle=\tiny, - basicstyle=\ttfamily, - keywordstyle=\color{black!60}, - commentstyle=\ttfamily\color{gray!70}, - breaklines=true, - literate= -} - 
-\renewcommand{\epsilon}{\varepsilon} - -\newcommand{\vecl}{\ensuremath{\operatorname{vec}_l}} -\newcommand{\Sym}{\ensuremath{\operatorname{Sym}}} - -\renewcommand{\vec}{\operatorname{vec}} -\newcommand{\devec}{\operatorname*{devec}} -\newcommand{\svec}{\operatorname{svec}} -\newcommand{\sym}{\operatorname{sym}} -\renewcommand{\skew}{\operatorname{skew}} -\newcommand{\rowSums}{\operatorname{rowSums}} -\newcommand{\colSums}{\operatorname{colSums}} -\newcommand{\diag}{\operatorname{diag}} - -\begin{document} - -\section{Kronecker Product Properties} -The \emph{mixed-product} property holds for matrices $A, B, C, D$ whenever the matrix products $A C$ and $B D$ are well defined: -\begin{displaymath} - (A\otimes B)(C \otimes D) = (A C) \otimes (B D). -\end{displaymath} -In combination with the \emph{Hadamard product} (element-wise multiplication), for matrices $A, C$ of the same size as well as $B, D$ of the same size, it holds that -\begin{displaymath} - (A\otimes B)\circ (C \otimes D) = (A \circ C) \otimes (B \circ D). -\end{displaymath} -The \emph{transpose} of the Kronecker product fulfills -\begin{displaymath} - (A\otimes B)^T = A^T \otimes B^T. -\end{displaymath} - -\section{Distance Computation} -The pair-wise distances $d_V(X_{i,:}, X_{j,:})$ arranged in the distance matrix $D\in\mathbb{R}^{n\times n}$ can be written as -\begin{align*} - \vec(D) = \rowSums(((X Q)\otimes 1_n - 1_n \otimes (X Q))^2) -\end{align*} -This can be computed in $\mathcal{O}(n^2p + np^2)$ time (vectorization and devectorization take $\mathcal{O}(1)$). - -The matrices $K, W$ are defined through their elements as -\begin{displaymath} - k_{i j} = \exp\left(-\frac{d_{i j}^2}{2 h^2}\right),\qquad w_{i j} = \frac{k_{i j}}{\sum_{m} k_{m j}}. -\end{displaymath} - -Next are $\bar{y}^{(m)}$ and the ``element-wise'' loss $l_i = L_n(V, X_i)$. 
-\begin{displaymath} - \bar{y}^{(m)} = W^T Y^m,\qquad l = \bar{y}^{(2)} - (\bar{y}^{(1)})^2 -\end{displaymath} - -\section{Gradient Computation} -The model is -\begin{displaymath} - Y \sim g(B^T X) + \epsilon. -\end{displaymath} - -Assume a data set $(X_i, Y_i)$ for $i = 1, ..., n$ with $X$ an $n\times p$ matrix such that each row represents one sample. Now let $l_i = L_n(V, X_i)$, $\bar{y}^{(1)}_j = (W^T Y)_j$ as well as $d_{i j}, w_{i j}$ the distance and weight matrix components. Then the gradient for the ``simple'' CVE method is given as -\begin{displaymath} - \nabla L_n(V) = \frac{1}{nh^2}\sum_{i = 1}^{n} \sum_{j = 1}^{n} (l_j - (Y_i - \bar{y}^{(1)}_j)^2) w_{i j} d_{i j} \nabla_V d_V(X_{i,:}, X_{j,:}). -\end{displaymath} -This representation is cumbersome and a direct implementation has an asymptotic run-time of $\Theta(n^2p^2)$, because the double sum over $n$ is quadratic in $n$ and because of the form of $\nabla_V d_V$. - -This can be optimized and written in matrix notation. First the distance gradient is given as -\begin{displaymath} - \nabla_V d_V(X_{i,:}, X_{j,:}) = -2 (X_{i,:} - X_{j,:})^T (X_{i,:} - X_{j,:}) V -\end{displaymath} -(Note: $X_{i,:}\in\mathbb{R}^{1\times p}$, aka a row representing one sample). In addition define the $n\times n$ matrix $S$ through its elements -\begin{displaymath} - s_{i j} = (l_j - (Y_i - \bar{y}^{(1)}_j)^2) w_{i j} d_{i j}. 
-\end{displaymath} -Substitution in the gradient leads to -\begin{align*} - \nabla L_n(V) - &= -\frac{2}{nh^2}\sum_{i = 1}^{n} \sum_{j = 1}^{n} s_{i j} (X_{i,:} - X_{j,:})^T (X_{i,:} - X_{j,:}) V \\ - &= -\frac{2}{nh^2}\sum_{i = 1}^{n} \sum_{j = 1}^{n} s_{i j} \left( X_{i,:}^T X_{i,:} - X_{i,:}^T X_{j,:} - X_{j,:}^T X_{i,:} + X_{j,:}^T X_{j,:} \right) V \\ - &= -\frac{2}{nh^2} \left( \sum_{i = 1}^{n}\sum_{j = 1}^{n} (s_{i j} + s_{j i}) X_{i,:}^T X_{i,:} - \sum_{i = 1}^{n}\sum_{j = 1}^{n} (s_{i j} + s_{j i}) X_{i,:}^T X_{j,:} \right) V \\ - &= -\frac{2}{nh^2} \left( X^T \diag(\colSums(S + S^T)) X - X^T (S + S^T) X \right) V \\ - &= -\frac{2}{nh^2} X^T \left( \diag(\colSums(S + S^T)) - (S + S^T) \right) X V -\end{align*} - -\begin{center}{\bf - ATTENTION: The given R examples illustrate the implementation in C, which is 0-indexed! -}\end{center} - -The \emph{vectorization} operation maps a matrix $A\in\mathbb{R}^{n\times m}$ into $\mathbb{R}^{nm}$ by stacking the columns of $A$; -\begin{displaymath} - \vec(A) = (a_{0,0}, a_{1,0}, a_{2,0},...,a_{n-1,0},a_{0,1},a_{1,1},...,a_{n-1,m-1})^T. -\end{displaymath} -The relation $\vec(A)_k = a_{i,j}$ holds for $k=nj+i$ such that $0\leq k < nm$ and $0\leq i < n, 0 \leq j < m$. This operation is obviously a bijection. 
When going ``backwards'' the dimension of the original space is required, therefore let $\devec_n$ be the operation such that $\devec_n(\vec(A)) = A$ for $A\in\mathbb{R}^{n\times m}$.\footnote{Note that for $B\in\mathbb{R}^{p\times q}$ such that $pq = nm$ it holds that $\devec_n(\vec(B))\in\mathbb{R}^{n\times m}$.} - -For symmetric matrices the information stored in $a_{i,j} = a_{j,i}$ is stored twice in $A=A^T\in\mathbb{R}^{n\times n}$. To remove this redundancy the \emph{symmetric vectorization} is defined, which saves the main diagonal and the lower triangular part of the symmetric matrix according to the scheme -\begin{displaymath} - \svec(A) = (a_{0,0},2a_{1,0},2a_{2,0},...,2a_{n-1,0},a_{1,1},2a_{2,1},...,2a_{n-1,1},a_{2,2},...,a_{n-1,n-1}) -\end{displaymath} -A bit more formally -\begin{displaymath} - \svec(A)_{k} = (2-\delta_{i,j})a_{i,j} \quad\text{for}\quad k = n j + i - \frac{j(j + 1)}{2}, 0\leq j \leq i < n. -\end{displaymath} - -\begin{lstlisting}[language=R] -n <- 3 -k <- function(i, j, n) { (j * n) + i - (j * (j + 1) / 2) } -i <- function(n) { rep(1:n - 1, n) } -j <- function(n) { rep(1:n - 1, each = n) } -A <- matrix(k(i(n), j(n), n), n) -A[which(j(n) > i(n))] <- NA -A -# [,1] [,2] [,3] -# [1,] 0 NA NA -# [2,] 1 3 NA -# [3,] 2 4 5 -vec <- function(A) { as.vector(A) } -svec <- function(A) { - n <- nrow(A) - ((2 - (i(n) == j(n))) * A)[i(n) >= j(n)] -} -svec(matrix(1, n, n)) -# [1] 1 2 2 1 2 1 -devec <- function(vec, n) { matrix(vec, n) } -\end{lstlisting} - -For a square matrix $A\in\mathbb{R}^{n\times n}$ we define -\begin{displaymath} - \sym(A) := \frac{A + A^T}{2}, \qquad \skew(A) := \frac{A - A^T}{2}. 
-\end{displaymath} - -% For a Matrix $A\in\mathbb{R}^{n\times n}$ the \emph{vectorization} operation is defined as a mapping from the matrices into a - -% Indexing a given matrix $A = (a_{ij})_{i,j = 1, ..., n} \in \mathbb{R}^{n\times n}$ given as -% \begin{displaymath} -% A = \begin{pmatrix} -% a_{0,0} & a_{0,1} & a_{0,2} & \ldots & a_{0,n-1} \\ -% a_{1,0} & a_{1,1} & a_{1,2} & \ldots & a_{1,n-1} \\ -% a_{2,0} & a_{2,1} & a_{2,2} & \ldots & a_{2,n-1} \\ -% \vdots & \vdots & \vdots & \ddots & \vdots \\ -% a_{n-1,0} & a_{n-1,1} & a_{n-1,2} & \ldots & a_{n-1,n-1} -% \end{pmatrix} -% \end{displaymath} - -% A symmetric matrix with zero main diagonal, meaning a matrix $S = S^T$ with $S_{i,i} = 0,\ \forall i = 1,..,n$ is given in the following form -% \begin{displaymath} -% S = \begin{pmatrix} -% 0 & s_{1,0} & s_{2,0} & \ldots & s_{n-1,0} \\ -% s_{1,0} & 0 & s_{2,1} & \ldots & s_{n-1,1} \\ -% s_{2,0} & s_{2,1} & 0 & \ldots & s_{n-1,2} \\ -% \vdots & \vdots & \vdots & \ddots & \vdots \\ -% s_{n-1,0} & s_{n-1,1} & s_{n-1,2} & \ldots & 0 -% \end{pmatrix} -% \end{displaymath} -% Therefore its sufficient to store only the lower triangular part, for memory efficiency and some further algorithmic shortcuts (sometime they are more expensive) the symmetric matrix $S$ is stored in packed form, meaning in a vector of the length $\frac{n(n-1)}{2}$. We use (like for matrices) a column-major order of elements and define the $\vecl:\Sym(n)\to \mathbb{R}^{n(n-1) / 2}$ operator defined as - -% \begin{displaymath} -% \vecl(S) = (s_{1,0}, s_{2,0},\cdots,s_{n-1,0},s_{2,1}\cdots,s_{n-1,n-2})^T -% \end{displaymath} - -% The relation between the matrix indices $i,j$ and the $\vecl$ index $k$ is given by - -% \begin{displaymath} -% (\vecl(S)_k = s_{i,j} \quad\Leftrightarrow\quad k = jn+i) : j \in \{0,...,n-2\} \land j < i < n. 
-% \end{displaymath} - -% \begin{center} -% \begin{tikzpicture}[xscale=1,yscale=-1] -% % \foreach \i in {0,...,5} { -% % \node at ({mod(\i, 3)}, {int(\i / 3)}) {$\i$}; -% % } -% \foreach \i in {1,...,4} { -% \foreach \j in {1,...,\i} { -% \node at (\j, \i) {$\i,\j$}; -% } -% } - -% \end{tikzpicture} -% \end{center} - -\newpage -\section{Algorithm} -The basic algorithm reads as follows: - -With -\begin{displaymath} - X_{diff} := X\otimes 1_n - 1_n\otimes X -\end{displaymath} -it holds that -\begin{displaymath} - X_{diff}Q = (X\otimes 1_n - 1_n\otimes X)Q = XQ\otimes 1_n - 1_n\otimes XQ -\end{displaymath} - -\newcommand{\rStiefel}{\operatorname{rStiefel}} -% \lstset{language=PseudoCode} -% \begin{lstlisting}[mathescape, caption=Erste Phase von \texttt{HDE} (siehe \cite{HDE}), label=code:HDE, captionpos=b] -% \begin{lstlisting}[mathescape] -% // Hallo Welt -% /* Hallo comment */ -% $X_{diff} \leftarrow X\otimes 1_n - 1_n\otimes X$ - -% for attempt from 1 to attempts do -% if $\exists V_{init}$ then -% $V \leftarrow V_{init}$ -% else -% $V \leftarrow \rStiefel(p, q)$ -% end if - -% /* Projection matrix into null space */ -% $Q \leftarrow I_p - VV^T$ - -% /* Pair-wise distances (row sum of squared elements) */ -% $D \leftarrow$ foreach $i,j=1,...,n$ as $D_{i,j}\leftarrow \|(X_{i,:}-X_{j,:})Q\|_2^2$ - -% /* Weights */ -% $W \leftarrow$ foreach $i,j=1,...,n$ as $W_{i,j} \leftarrow \frac{k(D_{i,j})}{\sum_{i} k(D_{i,j})}$ - -% $\bar{y}_1 \leftarrow W^TY$ -% $\bar{y}_2 \leftarrow W^T(Y\odot Y)$ - -% /* Element-wise losses */ -% $L \leftarrow \bar{y}_2 - \bar{y}_1^2$ - -% for epoch from 1 to epochs do - -% $G_t \leftarrow \gamma G_{t-1} + (1-\gamma) \nabla_c L(V)$ - -% end for -% end for -% \end{lstlisting} - -The gradient of the loss at a given position is -\begin{displaymath} - \nabla L_n(V) = \frac{1}{nh^2}\sum_{i = 0}^{n - 1} \sum_{j = 0}^{n - 1} (l_j - (Y_i - \bar{y}^{(1)}_j)^2) w_{i j} d_{i j} \nabla_V d_V(X_{i,:}, X_{j,:}) -\end{displaymath} -Now let the matrix $S$ be defined through its coefficients 
-\begin{displaymath} - s_{i j} = (l_j - (Y_i - \bar{y}^{(1)}_j)^2) w_{i j} d_{i j} -\end{displaymath} -This matrix is \underline{not} symmetric but we can consider the symmetric $S + S^T$ with a zero main diagonal because $D$ has a zero main diagonal, meaning $s_{i i} = 0$ because $d_{i i} = 0$ for each $i$. Therefore the following holds due to the fact that $\nabla_V d_V(X_{i,:}, X_{j,:}) = \nabla_V d_V(X_{j,:}, X_{i,:})$. -\begin{displaymath} - \nabla L_n(V) = \frac{1}{nh^2}\sum_{j = 0}^{n - 1} \sum_{i = j}^{n - 1} (s_{i j} + s_{j i}) \nabla_V d_V(X_{i,:}, X_{j,:}) -\end{displaymath} -Note the summation indices $0 \leq j \leq i < n$. Substitution with $\nabla_V d_V(X_{i,:}, X_{j,:}) = -2 (X_{i,:} - X_{j,:})^T(X_{i,:} - X_{j,:}) V$ evaluates to -\begin{displaymath} - \nabla L_n(V) = -\frac{2}{nh^2}\sum_{j = 0}^{n - 1} \sum_{i = j}^{n - 1} (s_{i j} + s_{j i}) (X_{i,:} - X_{j,:})^T(X_{i,:} - X_{j,:}) V -\end{displaymath} -Let $X_{-}$ be the matrix containing all pairs of $X_{i,:}$ to $X_{j,:}$ differences using the same row indexing scheme as the symmetric vectorization. -\begin{displaymath} - (X_{-})_{k,:} = X_{i,:} - X_{j,:} \quad\text{for}\quad k = n j + i - \frac{j(j + 1)}{2}, 0\leq j \leq i < n -\end{displaymath} -With the $X_{-}$ matrix the above double sum can be formalized in matrix notation as follows\footnote{only valid because $s_{i i} = 0$} -\begin{displaymath} - \nabla L_n(V) = -\frac{2}{nh^2} X_{-}^T(\svec(\sym(S)) \circ_r X_{-}) V -\end{displaymath} -where $\circ_r$ means the ``recycled'' Hadamard product, this is for a vector $x\in\mathbb{R}^n$ and a matrix $M\in\mathbb{R}^{n\times m}$ just the element-wise multiplication for each column of $M$ with $x$, or equivalently $x\circ_r M = \underbrace{(x, x, ..., x)}_{{n\times m}} \circ M$ where $\circ$ is the element-wise product. - - -\begin{lstlisting}[mathescape, language=PseudoCode] -/* Starting value and initial gradient. 
*/ -$V_1 \leftarrow V_{init}$ if $\exists V_{init}$ else $\rStiefel(p, q)$ -$G_1 \leftarrow (1 - \mu) \nabla L_n(V_1)$ - -/* Optimization loop */ -$t \leftarrow 1$ -while $t\leq\,$max.iter do - - /* Update on Stiefel manifold. */ - $A \leftarrow G_tV_t^T - V_tG_t^T$ - $V_{t+1} \leftarrow (I_p + \tau A)^{-1}(I_p - \tau A)V_{t}$ - - /* Check break condition. */ - if $\|V_{t+1}V_{t+1}^T - V_{t}V_{t}^T\|_2^2 \leq \sqrt{2q}\,$tol then - break - end if - - /* Check for decrease. */ - if $L_n(V_{t+1}) - L_n(V_{t}) > L_n(V_{t})\,$slack then // TODO: slack? - /* Reduce step-size. */ - $\tau \leftarrow \gamma\tau$ - else - /* Gradient at next position (with momentum). */ - $G_{t+1} \leftarrow \mu G_{t} + (1 - \mu) \nabla L_n(V_{t+1})$ - /* Increase step index */ - $t \leftarrow t + 1$ - end if - -end while -\end{lstlisting} - - -\end{document} \ No newline at end of file