init

2022-03-11 10:26:56 +01:00 · 2022-03-11 10:26:56 +01:00 · 3bf63e12ba
commit 3bf63e12ba
13 changed files with 713 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -0,0 +1,2 @@
+*.png filter=lfs diff=lfs merge=lfs -text
+*.svg filter=lfs diff=lfs merge=lfs -text
--- a/Exercise_01/Matrix.h
+++ b/Exercise_01/Matrix.h
@ -0,0 +1,110 @@
+#pragma once
+
+#include <stddef.h>
+#include <ostream>
+#include <vector>
+
+template <typename T>
+class MatrixView;
+
+template <typename T>
+class Matrix {
+public:
+    Matrix(size_t nrow, size_t ncol) : _nrow{nrow}, _ncol{ncol} {
+        _elem.reserve(nrow * ncol);
+    };
+
+    size_t nrow() const { return _nrow; };
+    size_t ncol() const { return _ncol; };
+    size_t size() const { return _nrow * _ncol; };
+
+          T& operator()(int i)       { return _elem[index(i)]; };
+    const T& operator()(int i) const { return _elem[index(i)]; };
+          T& operator()(int i, int j)       { return _elem[index(i, j)]; };
+    const T& operator()(int i, int j) const { return _elem[index(i, j)]; };
+
+private:
+    size_t _nrow;
+    size_t _ncol;
+    std::vector<T> _elem;
+
+    size_t index(int i) const {
+        int nelem = static_cast<int>(_nrow * _ncol);
+        while (i < 0)      { i += nelem; }
+        while (i >= nelem) { i -= nelem; }
+
+        return i;
+    };
+    size_t index(int i, int j) const {
+        int nrow = static_cast<int>(_nrow);
+        int ncol = static_cast<int>(_ncol);
+
+        while (i < 0)     { i += nrow; }
+        while (i >= nrow) { i -= nrow; }
+        while (j < 0)     { j += ncol; }
+        while (j >= ncol) { j -= ncol; }
+
+        return static_cast<size_t>(i + j * _nrow);
+    };
+};
+
+template <typename T>
+std::ostream& operator<<(std::ostream& out, const Matrix<T>& mat) {
+    for (size_t i = 0; i < mat.nrow(); ++i) {
+        for (size_t j = 0; j < mat.ncol(); ++j) {
+            out << mat(i, j) << ' ';
+        }
+        out << '\n';
+    }
+    return out;
+}
+
+template <typename T>
+class MatrixView {
+public:
+    MatrixView(Matrix<T>& matrix, size_t index, size_t stride, size_t nelem) :
+        _matrix(matrix), _index{index}, _stride{stride},
+        _nelem{nelem} { };
+
+    const size_t size() const { return _nelem; };
+
+          T& operator()(int i)       { return _matrix(_index + i * _stride); };
+    const T& operator()(int i) const { return _matrix(_index + i * _stride); };
+
+protected:
+    Matrix<T>& _matrix;
+    size_t _index;
+    size_t _stride;
+    size_t _nelem;
+};
+
+template <typename T>
+std::ostream& operator<<(std::ostream& out, const MatrixView<T>& view) {
+    for (size_t i = 0; i < view.size(); ++i) {
+        out << view(i) << ' ';
+    }
+
+    return out;
+}
+
+template <typename T>
+class Row : public MatrixView<T> {
+public:
+    Row(Matrix<T>& matrix, size_t index) :
+        MatrixView<T>(matrix, index * matrix.ncol(), 1, matrix.nrow()) { };
+};
+
+template <typename T>
+class Col : public MatrixView<T> {
+public:
+    Col(Matrix<T>& matrix, size_t index) :
+        MatrixView<T>(matrix, index, matrix.nrow(), matrix.ncol()) { };
+};
+
+template <typename T>
+class Diag : public MatrixView<T> {
+public:
+    Diag(Matrix<T>& matrix) :
+        MatrixView<T>(matrix, 0, matrix.nrow() + 1,
+            matrix.nrow() < matrix.ncol() ? matrix.nrow() : matrix.ncol()) { };
+};
--- a/Exercise_01/examples/MPI_hello_world.cpp
+++ b/Exercise_01/examples/MPI_hello_world.cpp
@ -0,0 +1,35 @@
+/**
+ * Compile
+ *  mpic++ MPI_hello_world.cpp -std=c++17 -Wall -pedantic -Wpedantic
+ * Usage
+ *  mpirun -n 4 ./a.out
+ * 
+ * Note: An easy way to finde the location of the `mpi.h` file is
+ *  mpic++ --showme:compile
+ * which might be usefull to configure the intelicense.
+ */
+
+#include <iostream>
+#include <mpi.h>
+
+int main(int argn, char* argv[]) {
+
+    // Initialize MPI (always required)
+    MPI_Init(nullptr, nullptr);
+
+    // Allocate MPI Settings
+    int num_proc;   /*< Number of processes */
+    int rank;       /*< This process rank (a.k.a. the MPI process ID) */
+    // Set/Get MPI Settings
+    MPI_Comm_size(MPI_COMM_WORLD, &num_proc);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    // Write MPI info to stdout
+    std::cout << "Hello World, i am rank " << rank << " of " << num_proc
+        << " processes." << std::endl;
+
+    // Shutdown MPI (always required)
+    MPI_Finalize();
+
+    return 0;
+}
--- a/Exercise_01/main.cpp
+++ b/Exercise_01/main.cpp
@ -0,0 +1,30 @@
+/**
+ * g++ main.cpp -std=c++17 -Wall -Wpedantic -pedantic -o main; ./main
+ */
+
+#include <iostream>
+
+#include "Matrix.h"
+
+int main(int argn, char* argv[]) {
+
+    Matrix<int> mat(3, 4);
+    for (size_t i = 0; i < mat.size(); ++i) {
+        mat(i) = i;
+    }
+
+    // std::cout << "Matrix:\n" << mat(2, 1) << std::endl;
+    std::cout << "Matrix:\n" << mat.nrow() << std::endl;
+
+    std::cout << "Matrix:\n" << mat << std::endl;
+
+    Col<int> colView(mat, 2);
+    Row<int> rowView(mat, 1);
+    Diag<int> diagView(mat);
+
+    std::cout << "col:\n" << colView << std::endl;
+    std::cout << "row:\n" << rowView << std::endl;
+    std::cout << "diag:\n" << diagView << std::endl;
+
+    return 0;
+}
--- a/Exercise_01_Assignment/README.md
+++ b/Exercise_01_Assignment/README.md
@ -0,0 +1,186 @@
+Lecture: 360.243 Numerical Simulation and Scientific Computing II, SS2022
+# Exercise 1
+
+**Handout:** Thursday, March 10
+
+**Submission:** Deadline for the group 
+submission via TUWEL is Thursday, March 31, end of day
+
+- Include the name of all actively collaborated group members in the submission documents.
+
+- The binaries should be callable with command line parameters as specified below.
+
+- Submit everything (Makefiles, Shell-Scripts, Sources, Binaries, PDF with Plots/Discussion) as one zip-file
+per task.
+
+- TUWEL-Course: https://tuwel.tuwien.ac.at/course/view.php?idnumber=360243-2022S
+
+**General information**
+
+- Use (and assume) the double precision floating point representation.
+
+- Test your MPI-parallel implementation on your local machine before you benchmark on the cluster.
+
+- Compare the results (i.e., residual and error) with your serial implementation to ensure a correct implementation.
+
+- Use only a small number of Jacobi iterations when benchmarking the performance your code: convergence is **not** required during
+benchmarking.
+
+***
+
+## MPI-Parallel Stencil-Based Jacobi Solver
+
+In this excercise, your task is to parallelize a stencil-based Jacobi solver for the 2D elliptic PDE 
+$$
+   -\Delta u(x,y) + k^2 u(x,y)  = f(x,y) \quad, \text{with} \ k=2\pi
+$$
+for the "unit square" domain 
+$$
+   \Omega = [0,1] \times [0,1]
+$$
+with the analytic solution
+$$
+   u_p(x,y) = \sin(2\pi x) \sinh(2\pi y)
+$$
+and right-hand-side
+$$
+   f(x,y) =  k^2 u_p(x,y)
+$$
+by implementing an MPI-based domain decomposition.
+The PDE is dicretized on a regular finite-difference grid  with fixed (Diriclet) boundary conditions:
+$$
+   \begin{align}
+      u(0,y) &= 0 \\
+      u(1,y) &= 0 \\
+      u(x,0) &= 0 \\
+      u(x,1) &= \sin(2\pi x)\sinh(2\pi) 
+   \end{align}
+$$
+The second-order derivates are discretized using a central difference scheme resulting in a "5-point star-shaped stencil".
+As a staring point for your implementation, you can use your own serial implementation of the Jacobi solver from the NSSC I exercises or use the source-code distributed with this exercise ([solver.hpp](solver.hpp)).
+
+## Domain Decomposition
+
+Your task is to decompose the finite-difference grid into domain regions such that multiple MPI-processes can independently perform an iteration on each region.
+The decoupling of the regions is achieved by introducing a *ghost layer* of grid points which surrounds each region.
+The values in the ghost layer of a region are not updated during an iteration.
+
+Instead, after an iteration is finished the updated values for the ghost layer are received from the neighbouring regions, and the boundary layer is sent to the neighouring regions (see Figure below).
+
+![Decomposition](images/unitsquare_decomposition_1D_2D.png)
+
+
+# Task 1: Questions (3 points)
+
+1. Describe the advantages/disadvantages of a two-dimensional decomposition (compared to a one-dimensional decomposition).
+2. Discuss if the decomposition of the domain changes the order of computations performed during a single Jacobi iteration (i.e., if you expect a numerically identical result after each iteration, or not).
+3. A generalization of the ghost layer approach would be to set the width of the ghost layer that is exchanged as a parameter `W` of the decomposition. This allows to perform `W` independent iterations before a communication of the ghost layers has to happen.
+ Comment in which situation (w.r.t the available bandwidth or latency between MPI-processes) multiple independent iterations are potentially advantageous.
+4. Assume a ghost layer with width `W=1` (this is what you will later implement) and discuss if a data exchange between parts of the domain which are "diagonal neighbors" is required (assuming a "5-point star-shaped stencil").
+5. How big is the sum of all L2 caches for 2 nodes of the IUE-cluster [link](https://www.iue.tuwien.ac.at/research/computing-infrastructure/) 
+
+
+# Task 2: One-Dimensional Decomposition (4 points)
+
+Your task is to implement a one-dimensional decomposition using a ghost layer and MPI-communication to update the ghost layers. 
+Create a program which is callable like this: 
+
+```bash
+mpirun -n NUMMPIPROC ./jacobiMPI resolution iterations
+# example call 
+mpirun -n 4 ./jacobiMPI 250 30
+```
+
+- `NUMMPIPROC`: number of MPI-processes to launch
+- `resolution`: number of grid points along each dimension of the unit square; the gridspacing is $`h = 1.0/(\text{resolution}-1)`$
+- `iterations`: number of Jacobi iterations to perform
+
+Further and more specifically, your program should
+
+- use $`\bar{u}_h=\mathbf{0}`$ as initial approximation to $`u`$, and (after finishing all iterations)
+- print the Euclidean $`\parallel \cdot \parallel_2`$ and Maximum $`\parallel \cdot \parallel_{\infty}`$ norm of the residual $`\parallel A_h\bar{u}_h-b_h \parallel`$ and of the total error $`\parallel \bar{u}_h-u_p \parallel`$ to the console, 
+- print the average runtime per iteration to the console, and
+- produce the same results as a serial run. 
+
+Finally, benchmark the parallel performance of your program `jacobiMPI` using 2 nodes of the IUE-Cluster for 4 different `resolution`s=$`\{125,250,1000,4000\}`$ using between 1 and 80 MPI-processes (`NUMMPIPROC`).
+More specifically, you should
+
+- create a plot of the parallel speedup and a plot of the parallel efficiency for each `resolution`, and
+- discuss your results in detail.
+
+**Notes:**
+
+- On your local machine, you can also launch MPI-runs using mpirun (after installing MPI, see `Makefile` for install commands on Ubuntu)
+- An example of a ”MPI”-Makefile and of a job submission script are provided with this exercise.
+- The use of `MPI_Cart_create`, `MPI_Cart_coords`, and `MPI_ Cart_shift` for setup of the communication paths is recommended.
+- Your implementation should work for any positive integer supplied for `NUMMPIPROC` (e.g., 1,2,3,4,...) and also utilize this number of processes for the decomposition.
+
+## Task 3: Two-Dimensional Decomposition (3 points)
+Extend your program from Task 2 by implementing a two-dimensional decomposition using a ghost layer and
+MPI-communication to update the ghost layers. Create a program which is callable like this:
+```bash
+mpirun -n NUMMPIPROC ./jacobiMPI DIM resolution iterations
+# example call
+mpirun -n 4 ./jacobiMPI 2D 125 200
+```
+
+- the command line parameters have the same meaning as above in Task 2
+- the new parameter `DIM` has two valid values `1D` or `2D` and switches between one-dimensional and two-dimensional decomposition.
+
+
+Ensure a correct implementation by comparing your results to a serial run. Benmarking on the cluster is **not**
+required.
+
+**Notes:**
+
+- Your implementation should work for any positive integer supplied for NUMMPIPROC (e.g., 1,2,3,4,...) and also utilize this number of processes for the 2D decomposition. 
+- If a the 2D composition is not possible with the supplied number of processes (i.e., a prime number), your program should resort to a 1D decomposition.
+
+***
+***
+
+# Working with the IUE-Cluster
+
+**Connecting**
+
+- Your login credentials will be provided via email.
+- You need to enable a "TU Wien VPN" connection.
+- You can login to the cluser using `ssh` and your credentials.
+- You will be asked to change your initial password upon first login.
+
+**File Transfer**
+
+- You can checkout this git-Repository once you are connected to the cluster.
+- You can transfer files and folders between your local machine and the cluster using `scp`
+- All nodes of the cluster operate on a shared file system (all your files on the cluster are also available when executing jobs)
+
+**Compiling on the cluster**
+
+- The cluster has a *login node* (the one you `ssh` to, details will be announced in the email with the credentials) 
+- This login node must only be used to compile your project and **never** to perform any benchmarks or MPI-runs (beside minimal lightweight tests of for the MPI configuration)
+- All other nodes of the cluster are used to run the "jobs" you submit.
+- To support cluster users, a set of *environement modules* (relevant for us is only the "MPI"-module) is made available. You can list all modules using `module avail`
+- Note that you also need to load the modules you require in your job subsmission scripts (see example provided in this repo).
+
+**Executing jobs on the cluster**
+
+- Once you successfully compiled your program on the login node of the cluster, you are read to submit jobs.
+- On our cluster job submissions are handled by the workload manager `SLURM` (a very common choice).
+- A job essentially is a shell-script which contains a call of your executable (see examples in this repo)- A additionally a job-script specifies its demands w.r.t. to resources, e.g.
+   - how many nodes are required?
+   - which runtime is anticipated?
+   - how many MPI-processes should be launched on each node?
+- After you submitted the job, it is up to the `SLURM` scheduler to queue it into the waiting list and to inform you once the job has completed.
+- The "results" of your job are
+   1. Potential output files which your program produced during execution.
+   2. The recording of the stdout and stderr which was recorded while your job executed (e.g., `slurm-12345.out`)
+
+**Useful Resources**
+
+SSH: https://phoenixnap.com/kb/ssh-to-connect-to-remote-server-linux-or-windows
+
+SCP: https://linuxize.com/post/how-to-use-scp-command-to-securely-transfer-files/
+
+SLURM: https://slurm.schedmd.com/quickstart.html
+
+Environment Modules: https://modules.readthedocs.io/en/latest/
--- a/Exercise_01_Assignment/images/unitsquare_decomposition_1D_2D.png
+++ b/Exercise_01_Assignment/images/unitsquare_decomposition_1D_2D.png
--- a/Exercise_01_Assignment/images/unitsquare_decomposition_1D_2D.svg
+++ b/Exercise_01_Assignment/images/unitsquare_decomposition_1D_2D.svg
--- a/Exercise_01_Assignment/src/Makefile
+++ b/Exercise_01_Assignment/src/Makefile
@ -0,0 +1,21 @@
+# requirements on ubuntu
+# sudo apt-get install build-essential
+# sudo apt-get install openmpi-bin openmpi-common openmpi-doc libopenmpi-dev
+
+# required modules on cluster
+# module load mpi/openmpi-x86_64
+# module load pmi/pmix-x86_64
+
+CXX=g++
+MPICXX?=mpic++
+CXXFLAGS := $(CXXFLAGS) -std=c++14 -O3 -Wall -pedantic -march=native -ffast-math
+
+jacobiSERIAL: Makefile main.cpp solver.hpp arguments.hpp
+	$(CXX) main.cpp -o jacobiSERIAL $(CXXFLAGS)
+
+jacobiMPI: Makefile main.cpp solver.hpp arguments.hpp
+	$(MPICXX) main.cpp -o jacobiMPI -lpthread -DUSEMPI $(CXXFLAGS)
+
+clean:
+	rm jacobiSERIAL jacobiMPI
+
--- a/Exercise_01_Assignment/src/arguments.hpp
+++ b/Exercise_01_Assignment/src/arguments.hpp
@ -0,0 +1,27 @@
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <tuple>
+
+using namespace std;
+
+template<typename T>
+T convertTo(const int position, const T init, int argc, char *argv[]) {
+  if (argc <= position) {
+    std::cout
+        << "Conversion of argument " << position
+        << " to 'int' failed, not enough parameters, using default parameter: "
+        << init << std::endl;
+    return init;
+  }
+  T arg;
+  std::istringstream tmp(argv[position]);
+  tmp >> arg ? (std::cout << "Conversion of argument " << position
+                             << " to 'int' successfull: " << arg)
+                : (std::cout << "Conversion of argument " << position
+                             << " to 'int' failed");
+  std::cout << std::endl;
+  return arg;
+}
--- a/Exercise_01_Assignment/src/main.cpp
+++ b/Exercise_01_Assignment/src/main.cpp
@ -0,0 +1,39 @@
+#include <assert.h>
+#include <iomanip>
+#include <iostream>
+#include <vector>
+#ifdef USEMPI
+#include <mpi.h>
+#endif
+#include "arguments.hpp"
+#include "solver.hpp"
+ 
+int main(int argc, char *argv[]) {
+
+  int rank=0;
+  int numproc=1;
+#ifdef USEMPI
+  MPI_Init(NULL,NULL);
+  MPI_Comm_size(MPI_COMM_WORLD, &numproc);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#endif
+
+  // parse command line arguments
+  auto resolution = convertTo<int>(1, 32, argc, argv);
+  auto iterations = convertTo<int>(2, 1000, argc, argv);
+
+  std::cout << "numproc=" << numproc << std::endl;
+  std::cout << "resolution=" << resolution << std::endl;
+  std::cout << "iterations=" << iterations << std::endl;
+
+  assert(resolution > 0);
+  assert(iterations > 0);
+
+  solve(resolution, iterations,rank,numproc);
+
+#ifdef USEMPI
+  MPI_Finalize();
+#endif
+
+  return 0;
+}
--- a/Exercise_01_Assignment/src/run_mp_single.sh
+++ b/Exercise_01_Assignment/src/run_mp_single.sh
@ -0,0 +1,17 @@
+#!/bin/bash
+#SBATCH -J jacobiMPI
+#SBATCH -N 1
+#SBATCH --ntasks-per-node=20
+#SBATCH --cpus-per-task=1
+#SBATCH --exclusive 
+#SBATCH --time=00:10:00
+if command -v sinfo  2>/dev/null # if on cluster
+then
+    module load mpi/openmpi-x86_64
+    module load pmi/pmix-x86_64
+    mpiproc=20
+else  # if on local machine
+    mpiproc=8
+fi
+
+mpirun -n $mpiproc ./jacobiMPI 128 1000
--- a/Exercise_01_Assignment/src/run_mpi_loops.sh
+++ b/Exercise_01_Assignment/src/run_mpi_loops.sh
@ -0,0 +1,31 @@
+#!/bin/bash
+#SBATCH -J jacobiMPI
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=40
+#SBATCH --cpus-per-task=1
+#SBATCH --exclusive 
+#SBATCH --time=01:00:00
+
+if command -v sinfo  2>/dev/null # if on cluster
+then
+    module load mpi/openmpi-x86_64
+    module load pmi/pmix-x86_64
+    mpiprocs=( 1 2 5 10 20 40 )
+    folder="datacluster"
+    mkdir -p $folder    
+else  # if on local machine
+    folder="datalocal"
+    mkdir -p $folder    
+    mpiprocs=( 1 2 )
+fi
+
+iterations=10
+resolutions=( 125 250 )
+
+for resolution in "${resolutions[@]}"
+do  
+    for procs in "${mpiprocs[@]}"
+    do  
+        mpirun -n $procs ./jacobiMPI $resolution $iterations |& tee "./${folder}/jacobiMPI_${resolution}_${iterations}_n_${procs}.log"
+    done
+done
--- a/Exercise_01_Assignment/src/solver.hpp
+++ b/Exercise_01_Assignment/src/solver.hpp
@ -0,0 +1,209 @@
+#pragma once
+
+#include <array>
+#include <chrono>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <vector>
+#ifdef USEMPI
+#include <mpi.h>
+#endif
+#include <assert.h>
+
+template <typename Type> class MatrixView {
+private:
+  std::vector<Type> &v;
+  MatrixView(const MatrixView &);
+  MatrixView &operator=(const MatrixView &);
+
+public:
+  const size_t N, M;
+  MatrixView(std::vector<Type> &v, size_t N, size_t M) : v(v), N(N), M(M) {
+    assert(v.size() / N == M);
+  }
+  Type &set(size_t i, size_t j) { return v[i + N * j]; }
+  const Type &get(size_t i, size_t j) { return v[i + N * j]; }
+  Type &set(size_t n) { return v[n]; }
+  const Type &get(size_t n) { return v[n]; }
+};
+
+double ParticularSolution(double x, double y) {
+  return sin(2 * M_PI * x) * sinh(2 * M_PI * y);
+}
+
+double NormL2(const std::vector<double> &v) {
+  double norm = 0;
+  for (const auto &value : v) {
+    norm += value * value;
+  }
+  return sqrt(norm);
+}
+
+double NormInf(const std::vector<double> &v) {
+  double max = std::numeric_limits<double>::lowest();
+  for (const auto &value : v) {
+    max = std::fabs(value) > max ? std::fabs(value) : max;
+  }
+  return max;
+}
+
+struct Stencil {
+  Stencil(double h)
+      : C(4.0 / (h * h) + 4 * M_PI * M_PI), N(-1.0 / (h * h)),
+        S(-1.0 / (h * h)), W(-1.0 / (h * h)), E(-1.0 / (h * h)) {}
+  const double C, N, S, W, E;
+};
+
+enum Cell { UNKNOWN = 0, DIR = 1, NEU = 2, ROB = 0 };
+
+void solve(size_t resolution, size_t iterations, int mpi_rank,
+           int mpi_numproc) {
+
+#ifdef USEMPI
+  std::cout << "Rank=" << mpi_rank << std::endl;
+  int ndims = 2;
+  int dims[2] = {};
+  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Dims_create(mpi_numproc, ndims, dims);
+  std::cout << "dims=(" << dims[0] << ", " << dims[1] << ")" << std::endl;
+  MPI_Comm topo_com;
+  int bcs[2] = {0, 0};
+  int reorder = 1;
+  MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, bcs, reorder, &topo_com);
+  int coords[2] = {};
+  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Cart_coords(topo_com, mpi_rank, ndims, coords);
+  std::cout << "coords=(" << coords[0] << ", " << coords[1] << ")" << std::endl;
+#endif
+
+  size_t NY = resolution;
+  size_t NX = resolution;
+  double h = 1.0 / (NY - 1);
+
+  const auto stencil = Stencil(h);
+
+  // domain cell types
+  std::vector<int> domain(NX * NY, Cell::UNKNOWN);
+  MatrixView<int> domainView(domain, NX, NY);
+  for (size_t i = 0; i != NX; ++i) {
+    domainView.set(i, 0) = Cell::DIR;
+    domainView.set(i, NY - 1) = Cell::DIR;
+  }
+  for (size_t j = 0; j != NY; ++j) {
+    domainView.set(0, j) = Cell::DIR;
+    domainView.set(NX - 1, j) = Cell::DIR;
+  }
+
+  // referenceSolution
+  std::vector<double> referenceSolution(NX * NY, 0);
+  MatrixView<double> referenceSolutionView(referenceSolution, NX, NY);
+  for (size_t j = 0; j != NY; ++j) {
+    for (size_t i = 0; i != NX; ++i) {
+      referenceSolutionView.set(i, j) = ParticularSolution(i * h, j * h);
+    }
+  }
+
+  // right hand side
+  std::vector<double> rightHandSide(NX * NY, 0);
+  MatrixView<double> rightHandSideView(rightHandSide, NX, NY);
+  for (size_t j = 0; j != NY; ++j) {
+    for (size_t i = 0; i != NX; ++i) {
+      rightHandSideView.set(i, j) =
+          ParticularSolution(i * h, j * h) * 4 * M_PI * M_PI;
+    }
+  }
+
+  auto SolverJacobi = [](std::vector<double> &sol, std::vector<double> &sol2,
+                         std::vector<double> &rhs, const Stencil &stencil,
+                         size_t NX, size_t NY) {
+    MatrixView<double> solView(sol, NX, NY);
+    MatrixView<double> sol2View(sol2, NX, NY);
+    MatrixView<double> rhsView(rhs, NX, NY);
+
+    for (size_t j = 1; j != NY - 1; ++j) {
+      for (size_t i = 1; i != NX - 1; ++i) {
+        sol2View.set(i, j) =
+            1.0 / stencil.C *
+            (rhsView.set(i, j) - (solView.get(i + 1, j) * stencil.E +           // rhsView.get !?
+                                  solView.get(i - 1, j) * stencil.W +
+                                  solView.get(i, j + 1) * stencil.S +
+                                  solView.get(i, j - 1) * stencil.N));
+      }
+    }
+    sol.swap(sol2);
+  };
+
+  auto ComputeResidual = [](std::vector<double> &sol, std::vector<double> &rhs,
+                            const Stencil &stencil, size_t NX, size_t NY) {
+    MatrixView<double> solView(sol, NX, NY);
+    MatrixView<double> rhsView(rhs, NX, NY);
+
+    std::vector<double> residual(NX * NY, 0);
+    MatrixView<double> residualView(residual, NX, NY);
+    for (size_t j = 1; j != NY - 1; ++j) {
+      for (size_t i = 1; i != NX - 1; ++i) {
+        residualView.set(i, j) =
+            rhsView.get(i, j) -
+            (solView.get(i, j) * stencil.C + solView.get(i + 1, j) * stencil.E +
+             solView.get(i - 1, j) * stencil.W +
+             solView.get(i, j - 1) * stencil.S +
+             solView.get(i, j + 1) * stencil.N);
+      }
+    }
+    return residual;
+  };
+  auto ComputeError = [](std::vector<double> &sol,
+                         std::vector<double> &reference, size_t NX, size_t NY) {
+    MatrixView<double> solView(sol, NX, NY);
+    MatrixView<double> referenceView(reference, NX, NY);
+
+    std::vector<double> error(NX * NY, 0);
+    MatrixView<double> errorView(error, NX, NY);
+
+    for (size_t j = 1; j != NY - 1; ++j) {
+      for (size_t i = 1; i != NX - 1; ++i) {
+        errorView.set(i, j) = referenceView.get(i, j) - solView.get(i, j);
+      }
+    }
+    return error;
+  };
+
+  // solution approximation starting with boundary initialized to dirichlet
+  // conditions, else 0
+  std::vector<double> solution(NX * NY, 0);
+  MatrixView<double> solutionView(solution, NX, NY);
+  for (size_t j = 0; j != NY; ++j) {
+    for (size_t i = 0; i != NX; ++i) {
+      if (domainView.get(i, j) == Cell::DIR)
+        solutionView.set(i, j) = ParticularSolution(i * h, j * h);
+    }
+  }
+  std::vector<double> solution2 = solution;
+  std::cout << "solve LSE using stencil jacobi" << std::endl;
+  auto start = std::chrono::high_resolution_clock::now();
+  for (size_t iter = 0; iter <= iterations; ++iter) {
+    SolverJacobi(solution, solution2, rightHandSide, stencil, NX, NY);
+  }
+
+  auto stop = std::chrono::high_resolution_clock::now();
+  auto seconds =
+      std::chrono::duration_cast<std::chrono::duration<double>>(stop - start)
+          .count();
+  std::cout << std::scientific << "runtime=" << seconds << std::endl;
+
+  {
+    auto residual = ComputeResidual(solution, rightHandSide, stencil, NX, NY);
+    auto residualNorm = NormL2(residual);
+    std::cout << std::scientific << "|residual|=" << residualNorm << std::endl;
+    auto residualMax = NormInf(residual);
+    std::cout << std::scientific << "|residualMax|=" << residualMax
+              << std::endl;
+    auto error = ComputeError(solution, referenceSolution, NX, NY);
+    auto errorNorm = NormL2(error);
+    std::cout << std::scientific << "|error|=" << errorNorm << std::endl;
+    auto errorMax = NormInf(error);
+    std::cout << std::scientific << "|errorMax|=" << errorMax << std::endl;
+  }
+}