// NSSC/Exercise_01/src/main.cpp
/**
* Implementation of an MPI-Parallel Stencil-Based Jacobi Solver for the 2D PDE
*
* -Du(x, y) + k^2 u(x, y) = f(x, y), (x, y) in [0, 1] x [0, 1]
*
* with the scalar k = 2 pi. The right hand side is
*
* f(x, y) = k^2 sin(2 pi x) sinh(2 pi y)
*
* and the Dirichlet boundary conditions
*
* u(0, y) = u(1, y) = u(x, 0) = 0, x, y in [0, 1]
* u(x, 1) = sin(2 pi x) sinh(2 pi), x in [0, 1]
*
* The following program can be compiled in serial or MPI-parallel form by
* defining the `USE_MPI` macro (see `Makefile`).
*/
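/*
* Example invocations (a sketch; the binary name `jacobi` is an assumption,
* the actual target name is defined by the `Makefile`):
*
*     ./jacobi 2D 250 10000                 # serial build (arg 1 is ignored)
*     mpirun -np 4 ./jacobi 2D 250 10000    # MPI build, 4 processes, 2D grid
*
* Arguments: <1D|2D> <resolution> <iterations>.
*/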
#define _USE_MATH_DEFINES /* enables math constants from `cmath` */
#include <stddef.h>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <functional>
#include <chrono>
#include <cmath>
#ifdef USE_MPI
#include <mpi.h>
#endif
#include "Matrix.h"
#include "Solver.h"
#include "utils.h"
int main(int argn, char* argv[]) {
/******************************* MPI Setup ********************************/
#ifdef USE_MPI
// Initialize MPI
MPI_Init(nullptr, nullptr);
// Get MPI configure
int mpi_size; /*< MPI pool size (a.k.a. total number of processes) */
MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
// Get MPI rank
int mpi_world_rank; /*< MPI world rank (a.k.a. grid process ID) */
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_world_rank);
#else
int mpi_size = 1;
int mpi_world_rank = 0;
#endif
/**************************** Parse Arguments *****************************/
if (argn < 4) {
std::cerr << "usage: " << argv[0] << " <1D|2D> <resolution> <iterations>" << std::endl;
return -1;
} else if (argn > 4) {
std::cerr << "warning: " << "ignoring all but the first three arguments" << std::endl;
}
#ifdef USE_MPI
size_t dim;
if (std::string(argv[1]) == "1D") {
dim = 1;
} else if (std::string(argv[1]) == "2D") {
dim = 2;
} else {
std::cerr << "error: Parsing arg 1 <dim> failed" << std::endl;
return -1;
}
#endif
size_t resolution;
if (!(std::istringstream(argv[2]) >> resolution)) {
std::cerr << "error: Parsing arg 2 <resolution> failed" << std::endl;
return -1;
}
size_t iterations;
if (!(std::istringstream(argv[3]) >> iterations)) {
std::cerr << "error: Parsing arg 3 <iterations> failed" << std::endl;
return -1;
}
if (resolution < 10 || resolution > 65536) {
std::cerr << "error: arg 2 <resolution> " << resolution
<< " out of domain [10, 65536]" << std::endl;
return -1;
}
if (iterations > 65536) {
std::cerr << "error: arg 3 <iterations> " << iterations
<< " out of domain [0, 65536]" << std::endl;
return -1;
}
// Report configuration
if (mpi_world_rank == 0) {
std::cout << std::setfill(' ')
#ifdef USE_MPI
<< "use MPI: YES\n"
#else
<< "use MPI: NO\n"
#endif
<< "nr. processes: " << std::setw(5) << mpi_size << '\n'
<< "resolution: " << std::setw(5) << resolution << '\n'
<< "iterations: " << std::setw(5) << iterations << std::endl;
}
/************************* Start Time Measurement *************************/
auto start = std::chrono::high_resolution_clock::now();
/**************** Setup (local) PDE + Boundary Conditions *****************/
const double k = 2.0 * M_PI; /* k = 2 pi, as specified in the header above */
const double h = 1.0 / static_cast<double>(resolution - 1);
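// With h = 1 / (resolution - 1) each axis of the unit square is discretized
// into `resolution` grid points, boundaries included; e.g. resolution = 5
// gives h = 0.25 and the points {0, 0.25, 0.5, 0.75, 1}.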
#ifdef USE_MPI
// Group processes into a Cartesian communication topology. Set initial
// values for a 1D grid.
int mpi_dims[2] = {mpi_size, 1};
// In case of a 2D grid, partition both axes as equally as possible. Note
// that `MPI_Dims_create` does not guarantee this: for example, for 9
// processes it was observed to generate a 9 x 1 grid, whereas the
// following computes the 3 x 3 decomposition.
if (dim == 2) {
two_factors(mpi_size, mpi_dims);
}
// Report grid topology (root rank only)
if (mpi_world_rank == 0) {
std::cout << "topology: " << dim << "D";
if (dim == 2) {
std::cout << " (" << mpi_dims[0] << " x " << mpi_dims[1] << ")";
}
std::cout << std::endl;
}
// Setup a Cartesian topology communicator (NON-cyclic)
const int mpi_periods[2] = {false, false};
MPI_Comm mpi_comm_grid;
MPI_Cart_create(
MPI_COMM_WORLD, // Old Communicator
2, // number of dimensions
mpi_dims, // grid dimensions
mpi_periods, // grid periodicity
true, // allow process reordering
&mpi_comm_grid
);
// Get MPI rank with respect to the grid communicator
int mpi_grid_rank; /*< MPI grid rank (a.k.a. grid process ID) */
MPI_Comm_rank(mpi_comm_grid, &mpi_grid_rank);
// Get coordinates with respect to the grid communicator
int mpi_coords[2];
MPI_Cart_coords(mpi_comm_grid, mpi_grid_rank, 2, mpi_coords);
// Get direct neighbors in the communication grid
struct { int north; int east; int south; int west; } mpi_neighbors;
// Get X-direction (dim 0) neighbors
MPI_Cart_shift(
mpi_comm_grid, // grid communicator
0, // axis index (0 <-> X)
1, // offset
&(mpi_neighbors.west), // negated offset neighbor
&(mpi_neighbors.east) // offset neighbor
);
// Get Y-direction (dim 1) neighbors
MPI_Cart_shift(
mpi_comm_grid,
1, // axis index (1 <-> Y)
1,
&(mpi_neighbors.south),
&(mpi_neighbors.north)
);
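// Illustration (sketch) of the neighbor lookup on a 3 x 3 process grid,
// with axis 0 <-> X (west/east) and axis 1 <-> Y (south/north) as above.
// Ranks at the domain border receive MPI_PROC_NULL for the missing
// neighbor, which the exchange code below checks explicitly.
//
//   Y ^  +-------+-------+-------+   e.g. process (1,1) has
//     |  | (0,2) | (1,2) | (2,2) |   west  = (0,1), east  = (2,1),
//     |  +-------+-------+-------+   south = (1,0), north = (1,2);
//     |  | (0,1) | (1,1) | (2,1) |   process (0,0) has
//     |  +-------+-------+-------+   west = south = MPI_PROC_NULL.
//     |  | (0,0) | (1,0) | (2,0) |
//     +--+-------+-------+-------+--> X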
// Calculate local (base) grid size (without ghost layers)
size_t nx = partition(resolution, mpi_dims[0], mpi_coords[0]);
size_t ny = partition(resolution, mpi_dims[1], mpi_coords[1]);
// Add ghost layers for each (existing) neighbor
ny += (mpi_neighbors.north != MPI_PROC_NULL);
nx += (mpi_neighbors.east != MPI_PROC_NULL);
ny += (mpi_neighbors.south != MPI_PROC_NULL);
nx += (mpi_neighbors.west != MPI_PROC_NULL);
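// Worked example (assuming `partition(n, p, i)` assigns roughly n / p grid
// points of an n-point axis to the i-th of p processes): for resolution = 10
// on a 2 x 1 process grid each process owns 5 x 10 points and gains one
// ghost column from its single X-neighbor, i.e. a 6 x 10 local matrix.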
// Compute local domain [xmin, xmax] x [ymin, ymax]
double xmin = (mpi_neighbors.west == MPI_PROC_NULL) ? 0.0
: h * (partition_sum(resolution, mpi_dims[0], mpi_coords[0] - 1) - 1);
double xmax = (mpi_neighbors.east == MPI_PROC_NULL) ? 1.0
: h * partition_sum(resolution, mpi_dims[0], mpi_coords[0]);
double ymin = (mpi_neighbors.south == MPI_PROC_NULL) ? 0.0
: h * (partition_sum(resolution, mpi_dims[1], mpi_coords[1] - 1) - 1);
double ymax = (mpi_neighbors.north == MPI_PROC_NULL) ? 1.0
: h * partition_sum(resolution, mpi_dims[1], mpi_coords[1]);
// Create an MPI vector type (for boundary condition exchange).
// It allows direct exchange of the east/west boundaries, whose elements are
// not adjacent in memory (they are strided by nx in the column-major matrix
// layout), in contrast to the contiguous north/south boundaries.
MPI_Datatype mpi_type_row;
MPI_Type_vector(ny, 1, nx, MPI_DOUBLE, &mpi_type_row);
MPI_Type_commit(&mpi_type_row);
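// Layout sketch, assuming the Matrix class stores the nx x ny local grid in
// column-major order with x as the fast index, i.e. element (i, j) sits at
// offset i + j * nx. A fixed-x line (east or west boundary) then occupies
// offsets i, i + nx, i + 2*nx, ... which is exactly what
// MPI_Type_vector(ny, 1, nx, MPI_DOUBLE, ...) describes:
// ny blocks of one double, spaced nx doubles apart.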
#else
// discretization grid resolution
size_t nx = resolution, ny = resolution;
// PDE domain borders [xmin, xmax] x [ymin, ymax] = [0, 1] x [0, 1]
double xmin = 0.0, xmax = 1.0, ymin = 0.0, ymax = 1.0;
#endif
// Declare right hand side function f(x, y) = k^2 sin(2 pi x) sinh(2 pi y)
std::function<double(double, double)> fun = [k](double x, double y) {
return k * k * sin(2 * M_PI * x) * sinh(2 * M_PI * y);
};
// Boundary conditions
/** North boundary condition gN(x) = sin(2 pi x) sinh(2 pi) */
std::function<double(double)> gN;
#ifdef USE_MPI
// Check if local north boundary is part of the global north boundary
if (mpi_neighbors.north == MPI_PROC_NULL) {
#endif
// The local north boundary coincides with the global north boundary
gN = [k](double x) { return sin(2 * M_PI * x) * sinh(2 * M_PI); };
#ifdef USE_MPI
} else {
// The local north boundary is an interior (ghost) boundary, hence zero
gN = [k](double) { return 0.0; };
}
#endif
/** East, South and West boundary conditions are all 0 */
std::function<double(double)> g0 = [k](double) { return 0.0; };
/******************************* Solve PDE ********************************/
// Instantiate solver (local instance)
Solver solver(nx, ny, xmin, xmax, ymin, ymax, h, k, fun, gN, g0, g0, g0);
// Run solver iterations
for (size_t iter = 0; iter < iterations; ++iter) {
// Perform a single stencil Jacobi iteration
solver.iterate();
#ifdef USE_MPI
// Post non-blocking sends of the boundary values to all (existing) neighbors
MPI_Request mpi_requests[4];
int mpi_request_count = 0;
if (mpi_neighbors.north != MPI_PROC_NULL) {
auto bound = solver.read_boundary(Solver::Dir::North);
MPI_Isend(bound.data(), bound.size(), MPI_DOUBLE,
mpi_neighbors.north, iter, mpi_comm_grid,
&mpi_requests[mpi_request_count++]);
}
if (mpi_neighbors.east != MPI_PROC_NULL) {
auto bound = solver.read_boundary(Solver::Dir::East);
MPI_Isend(bound.data(), 1, mpi_type_row,
mpi_neighbors.east, iter, mpi_comm_grid,
&mpi_requests[mpi_request_count++]);
}
if (mpi_neighbors.south != MPI_PROC_NULL) {
auto bound = solver.read_boundary(Solver::Dir::South);
MPI_Isend(bound.data(), bound.size(), MPI_DOUBLE,
mpi_neighbors.south, iter, mpi_comm_grid,
&mpi_requests[mpi_request_count++]);
}
if (mpi_neighbors.west != MPI_PROC_NULL) {
auto bound = solver.read_boundary(Solver::Dir::West);
MPI_Isend(bound.data(), 1, mpi_type_row,
mpi_neighbors.west, iter, mpi_comm_grid,
&mpi_requests[mpi_request_count++]);
}
// Wait for all sends to complete before receiving new data
// (just to be safe)
MPI_Waitall(mpi_request_count, mpi_requests, MPI_STATUSES_IGNORE);
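// Note (assumption about message sizes): completing the sends before posting
// any receive relies on MPI buffering the outgoing boundaries (eager
// protocol). A standard-mode send of a very large boundary may not complete
// until the matching receive is posted; pre-posting MPI_Irecv (or using
// MPI_Sendrecv) would remove that dependency.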
// Get new boundary conditions using a blocking receive
MPI_Status mpi_status;
if (mpi_neighbors.north != MPI_PROC_NULL) {
auto bound = solver.write_boundary(Solver::Dir::North);
MPI_Recv(bound.data(), bound.size(), MPI_DOUBLE,
mpi_neighbors.north, iter, mpi_comm_grid, &mpi_status);
}
if (mpi_neighbors.east != MPI_PROC_NULL) {
auto bound = solver.write_boundary(Solver::Dir::East);
MPI_Recv(bound.data(), 1, mpi_type_row,
mpi_neighbors.east, iter, mpi_comm_grid, &mpi_status);
}
if (mpi_neighbors.south != MPI_PROC_NULL) {
auto bound = solver.write_boundary(Solver::Dir::South);
MPI_Recv(bound.data(), bound.size(), MPI_DOUBLE,
mpi_neighbors.south, iter, mpi_comm_grid, &mpi_status);
}
if (mpi_neighbors.west != MPI_PROC_NULL) {
auto bound = solver.write_boundary(Solver::Dir::West);
MPI_Recv(bound.data(), 1, mpi_type_row,
mpi_neighbors.west, iter, mpi_comm_grid, &mpi_status);
}
#endif
}
/***************************** Solution Stats *****************************/
// Get solution
Matrix<double>& solution = solver.solution();
// Compute analytic (true) solution
Matrix<double> analytic(solution.nrow(), solution.ncol());
for (size_t j = 0; j < analytic.ncol(); ++j) {
for (size_t i = 0; i < analytic.nrow(); ++i) {
analytic(i, j) = sin(2 * M_PI * solver.X(i)) * sinh(2 * M_PI * solver.Y(j));
}
}
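// Note: u(x, y) = sin(2 pi x) sinh(2 pi y) is indeed the exact solution:
// u_xx = -(2 pi)^2 u and u_yy = +(2 pi)^2 u, so Laplace(u) = 0 and
// -Laplace(u) + k^2 u = k^2 sin(2 pi x) sinh(2 pi y) = f(x, y),
// while the Dirichlet boundary conditions from the header are met as well.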
#ifdef USE_MPI
// Frobenius Norm Accumulator
double frob_norm_sendbuf[2] = {
solver.resid_norm(), solution.dist(analytic, 1, 1, 1, 1)
};
// Square the norms (the global norm is the square root of the sum of the
// squared partial norms)
frob_norm_sendbuf[0] = frob_norm_sendbuf[0] * frob_norm_sendbuf[0];
frob_norm_sendbuf[1] = frob_norm_sendbuf[1] * frob_norm_sendbuf[1];
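// i.e. the global Frobenius norm is ||r||_2 = sqrt(sum_p ||r_p||_2^2) over
// all processes p; the maximum norms below reduce via
// ||r||_inf = max_p ||r_p||_inf.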
// Maximum Norm Accumulator
double max_norm_sendbuf[2] = {
solver.resid_norm<Max>(), solution.dist<Max>(analytic, 1, 1, 1, 1)
};
// Global result reduction buffers
double frob_norm_recvbuf[2], max_norm_recvbuf[2];
// Accumulate global stats by sum/max reduction of local stats.
MPI_Reduce(frob_norm_sendbuf, frob_norm_recvbuf, 2, MPI_DOUBLE, MPI_SUM,
0, MPI_COMM_WORLD);
MPI_Reduce(max_norm_sendbuf, max_norm_recvbuf, 2, MPI_DOUBLE, MPI_MAX,
0, MPI_COMM_WORLD);
// Finish by taking the square root of the accumulated global squared
// Frobenius norms
frob_norm_recvbuf[0] = std::sqrt(frob_norm_recvbuf[0]);
frob_norm_recvbuf[1] = std::sqrt(frob_norm_recvbuf[1]);
#else
double frob_norm_recvbuf[2] = {
solver.resid_norm(), solution.dist(analytic, 1, 1, 1, 1)
};
double max_norm_recvbuf[2] = {
solver.resid_norm<Max>(), solution.dist<Max>(analytic, 1, 1, 1, 1)
};
#endif
// calculate run-time
auto stop = std::chrono::high_resolution_clock::now();
auto time = std::chrono::duration_cast<std::chrono::duration<double>>(stop - start)
.count();
// Report global stats (from "root" only)
if (mpi_world_rank == 0) {
std::cout
<< "Time [sec]: " << std::setw(15) << time
<< "\n|| residuals ||_2: " << std::setw(15) << frob_norm_recvbuf[0]
<< "\n|| residuals ||_inf: " << std::setw(15) << max_norm_recvbuf[0]
<< "\n|| error ||_2: " << std::setw(15) << frob_norm_recvbuf[1]
<< "\n|| error ||_inf: " << std::setw(15) << max_norm_recvbuf[1]
<< std::endl;
}
// MPI shutdown/cleanup
#ifdef USE_MPI
MPI_Type_free(&mpi_type_row);
MPI_Finalize();
#endif
return 0;
}