tensor_predictors/mvbernoulli/src/ising_model.cpp

#include <Rcpp.h>           // R to C++ binding library
#include <cmath>            // `std::exp`
#include <iostream>
#include <iomanip>
#include <vector>
#include <algorithm>
#include <bitset>           // mainly for debugging reasons

#include "types.h"          // MVBinary (Multivariate Binary Data)
#include "bit_utils.h"      // uint32_t, ... and the `bit*` functions
#include "int_utils.h"      // isqrt, ilog, ...

#include "threadPool.h"

/******************************************************************************/
/***                              Ising model                               ***/
/******************************************************************************/
/**
 * The Ising model (as a special case of the Multi-Variate Bernoulli) has its
 * probability mass function (pmf) for a `p` dim. binary vector `y` defined as
 *
 *      P(Y = y | theta) = p_0(theta) exp(T(y)' theta)
 *
 * with the parameter vector `theta` and a statistic `T` of `y`. The real valued
 * parameter vector `theta` is of dimension `p (p + 1) / 2` and the statistic
 * `T` has the same dimensions as a binary vector given by
 *
 *      T(y) = vech(y y').
 *
 * TODO: continue
 */


// [[Rcpp::export(rng = false)]]
double ising_log_odds_sum(uint32_t y, const VechView& theta) {
    // Collects the result `T(theta)' y` (basically the sum of the log odds in
    // the parameter vector `theta` with a single or two way interaction in `y`
    double log_odds = 0.0;

    // Iterate over all bits in the event `y`
    for (; y; y &= y - 1) {
        // get LSB index
        int i = bitScanLS(y);
        // the single effect index in the parameter vector `theta`
        int base_index = (i * (2 * theta.dim() + 1 - i)) / 2;
        // add single effect log odds
        log_odds += theta[base_index];
        // For all (remaining) other effects add the two way interaction odds
        for (uint32_t b = y & (y - 1); b; b &= b - 1) {
            log_odds += theta[base_index + bitScanLS(b) - i];
        }

    }

    return log_odds;
}

//' Ising model scaling factor `p_0(theta)` for the Ising model
//'
// [[Rcpp::export(rng = false)]]
double ising_prob0(const VechView& theta) {
    // Value of the event `(1, ..., 1)`
    const uint32_t max_event = (static_cast<uint64_t>(1) << theta.dim()) - 1;
    // Accumulates `p_0(theta)^-1`
    double sum_0 = 1.0;

    // sum up all event (except `(0, .., 0)`, considured as initial value `1`)
    for (uint32_t a = 1; a <= max_event; ++a) {
        sum_0 += exp(ising_log_odds_sum(a, theta));
    }

    // scaling factor `p_0(theta) = sum_0^-1`
    return 1.0 / sum_0;
}

//' Ising model probabilities for every event, this returns a vector of size `2^p`
//' with indices corresponding to the events binary representation.
//'
//' Note: the R indexing leads to adding +1 to every index.
//'
// [[Rcpp::export(rng = false)]]
Rcpp::NumericVector ising_probs(const VechView& theta) {
    // setup probability vector
    Rcpp::NumericVector probs(1 << theta.dim());

    // Value of the event `(1, ..., 1)`
    const uint32_t max_event = (static_cast<uint64_t>(1) << theta.dim()) - 1;
    // Accumulates `p_0(theta)^-1`
    double sum_0 = 1.0;

    // set prob for the zero event to `1`, scaled later with all the other events
    probs[0] = 1.0;

    // sum up all event (except `(0, .., 0)`, considured as initial value `1`)
    for (uint32_t a = 1; a <= max_event; ++a) {
        // set and accumulate (unscaled) probabilites
        sum_0 += (probs[a] = exp(ising_log_odds_sum(a, theta)));
    }

    // finish scaling factor calculation `p_0(theta) = 1 / sum_0`
    double prob_0 = 1.0 / sum_0;

    // scale probabilites
    for (auto& prob : probs) {
        prob *= prob_0;
    }

    return probs;
}

//' Computes the zero conditioned probabilities
//'      P(Y_i = 1 | Y_-i = 0)
//' and
//'      P(Y_i = 1, Y_j = 1 | Y_-i = 0)
//' from the natural parameters `theta` with matching indexing.
//'
//' This is the inverse function of `ising_theta_from_cond_prob`
//'
// [[Rcpp::export(rng = false)]]
Rcpp::NumericVector ising_cond_probs(const VechView& theta) {
    // setup result of the same size as theta
    Rcpp::NumericVector pi(theta.size());

    // set random variable dimension
    std::size_t p = theta.dim();

    // iterate all single effects
    for (std::size_t i = 0; i < p; ++i) {
        // compute probs for single effects
        std::size_t base_index = (i * (2 * p + 1 - i)) / 2;
        double exp_i = exp(theta[base_index]);
        // the probability `P(Y_i = 1 | Y_-i = 0)`
        pi[base_index] = exp_i / (1 + exp_i);
        // iterate over bigger indexed components interacting with the `i`th one
        for (std::size_t j = i + 1; j < p; ++j) {
            // again compute exp(theta_j)
            double exp_j = exp(theta[(j * (2 * p + 1 - j)) / 2]);
            // as well as the two way exponent
            double exp_ij = exp(theta[base_index + (j - i)]);
            // two way consitional probability `P(Y_i = Y_j = 1 | Y_-i,-j = 0)`
            pi[base_index + (j - i)] = (exp_i * exp_j * exp_ij)
                / (1.0 + exp_i + exp_j + exp_i * exp_j * exp_ij);
        }
    }

    return pi;
}

//' Computes the expectation of `Y` under the Ising model with natural parameter
//' `theta` given component wise by
//'
//'      E Y_i = P(Y_i = 1)
//'
// [[Rcpp::export(rng = false)]]
Rcpp::NumericVector ising_expectation(const VechView& theta) {
    const std::size_t p = theta.dim();  // dim of `Y`
    const uint32_t max_event = static_cast<uint32_t>(-1) >> (32 - p);
    double sum_0 = 1.0;                 // accumulates `p_0(theta)^-1`
    Rcpp::NumericVector mu(p, 0.0);     // `mu = E Y`

    // iterate all 2^p events (except the zero event)
    for (uint32_t y = 1; y <= max_event; ++y) {
        // dot product `vech(y y')' theta` for current event
        double dot = 0;
        // iterate all bits in `y`
        for (uint32_t a = y; a; a &= a - 1) {
            int i = bitScanLS(a);
            int base_index = (i * (2 * p + 1 - i)) / 2;
            // add single effects of `y`
            dot += theta[base_index];
            // iterate over all (higher indexed) interactions and add then
            for (uint32_t b = a & (a - 1); b; b &= b - 1) {
                dot += theta[base_index + bitScanLS(b) - i];
            }
        }
        // compute (unscaled) event `y` probability `exp(T(y)' theta)`
        double prob_y = exp(dot);
        sum_0 += prob_y;

        // add current (unscalled) probability to all components of set bits
        for (uint32_t a = y; a; a &= a - 1) {
            mu[bitScanLS(a)] += prob_y;
        }
    }

    // finalize `E[Y]` by scaling with `p_0 = sum_0^-1`
    double prob_0 = 1.0 / sum_0;
    for (auto& mui : mu) {
        mui *= prob_0;
    }

    return mu;
}

//' Computes the covariance (second centered moment) of `Y` under the Ising model
//' with natural parameter `theta`.
//'
//'      cov(Y, Y) = E[Y Y'] - E[Y] E[Y]'
//'
// [[Rcpp::export(rng = false)]]
Rcpp::NumericMatrix ising_cov(const VechView& theta) {
    const std::size_t p = theta.dim();  // dim of `Y`
    const uint32_t max_event = static_cast<uint32_t>(-1) >> (32 - p);
    double sum_0 = 1.0;                 // accumulates `p_0(theta)^-1`
    Rcpp::NumericMatrix cov(p, p);

    // iterate over all 2^p events (except the zero event)
    for (std::size_t y = 1; y <= max_event; ++y) {
        // dot product `vech(y y')' theta` for current event
        double dot = 0;
        // iterate all bits in `y`
        for (uint32_t a = y; a; a &= a - 1) {
            int i = bitScanLS(a);
            int base_index = (i * (2 * p + 1 - i)) / 2;
            // add single effects of `y`
            dot += theta[base_index];
            // iterate over all (higher indexed) interactions and add then
            for (uint32_t b = a & (a - 1); b; b &= b - 1) {
                dot += theta[base_index + bitScanLS(b) - i];
            }
        }
        // compute (unscaled) event `y` probability `exp(T(y)' theta)`
        double prob_y = exp(dot);
        sum_0 += prob_y;

        // sum E[Y, Y'] but still unscaled
        for (uint32_t a = y; a; a &= a - 1) {
            int i = bitScanLS(a);
            for (uint32_t b = y; b; b &= b - 1) {
                int j = bitScanLS(b);
                cov[i * p + j] += prob_y;
            }
        }
    }

    // finish computing `E[Y Y']` by scaling with `p_0 = sum_0^-1`
    double prob_0 = 1.0 / sum_0;
    for (auto& covij : cov) {
        covij *= prob_0;
    }

    // subtract outer product of expectation `-= E[Y] E[Y]'`
    const auto mu = ising_expectation(theta); // `mu = E[Y]`
    for (std::size_t j = 0; j < p; ++j) {
        for (std::size_t i = 0; i < p; ++i) {
            cov[j * p + i] -= mu[i] * mu[j];
        }
    }

    return cov;
}

//' Computes the single and two way effect marginal probabilities
//'
//'      P(Y_i = 1)
//' and
//'      P(Y_i Y_j = 1)
//'
//' In its vectorized form this function computes E[vech(Y Y')]
//'
// [[Rcpp::export(rng = false)]]
Rcpp::NumericVector ising_marginal_probs(const VechView& theta) {
    // Step 0: Setup (and validate) variables/parameters
    const std::size_t p = theta.dim();
    if (p != theta.dim()) {
        Rcpp::stop("Parameter dimension does not match data dimension");
    }
    const uint32_t max_event = static_cast<uint32_t>(-1) >> (32 - p);
    double sum_0 = 1.0;  // accumulates p_0(theta)^-1
    Rcpp::NumericVector score(theta.size(), 0.0); // grad l(theta)

    // Step 1: Calc `-n E[vech(Y Y')]` where the sum over `y` is the sum over
    // all possible events `y in {0, 1}^p`.
    for (uint32_t y = 1; y <= max_event; ++y) {
        // sum of `T(y)' theta` for current instance `y`
        double log_odds = 0;
        // iterate all bits in `y`
        for (uint32_t a = y; a; a &= a - 1) {
            int i = bitScanLS(a);
            int base_index = (i * (2 * p + 1 - i)) / 2;
            // add single effects of `y`
            log_odds += theta[base_index];
            // iterate over all (higher indexed) interactions and add then
            for (uint32_t b = a & (a - 1); b; b &= b - 1) {
                log_odds += theta[base_index + bitScanLS(b) - i];
            }
        }
        // compute (unscaled) event `y` probability `exp(T(y)' theta)`
        double prob_y = exp(log_odds);
        sum_0 += prob_y;

        // at this point we know the (unscaled) probability of the event `y`
        // which needs to be added to the gradient at the set bit positions
        // of `y` corresponding to single and interaction effects
        for (uint32_t a = y; a; a &= a - 1) {
            int i = bitScanLS(a);
            int base_index = (i * (2 * p + 1 - i)) / 2;
            score[base_index] += prob_y;
            for (uint32_t b = a & (a - 1); b; b &= b - 1) {
                score[base_index + bitScanLS(b) - i] += prob_y;
            }
        }
    }
    // finalize `-E[vech(Y Y')]` by scaling with `-p_0 = -sum_0^-1`
    double prob_0 = 1.0 / sum_0;
    for (auto& s : score) {
        s *= prob_0;
    }

    return score;
}

//' Natural parameters from the sufficient conditional probability statistic `pi`
//'
//' Computes the natural parameters `theta` of the Ising model from zero
//' conditioned probabilites for single and two way effects.
//'
//' This is the inverse function of `ising_cond_prob_from_theta`
//'
// [[Rcpp::export(rng = false)]]
Rcpp::NumericVector ising_theta_from_cond_prob(const VechView& pi) {
    // initialize natural parameters vector theta
    Rcpp::NumericVector theta(pi.size());

    // check if given probabilities are in the ragne [0, 1]
    if (std::any_of(pi.begin(), pi.end(), [](const double prob) {
        return (prob < 0.0) || (1.0 < prob);
    })) {
        Rcpp::stop("`pi` must contain only elements in the range [0, 1]");
    }

    // get random variable dimension
    std::size_t p = pi.dim();

    // iterate all single effects
    for (std::size_t i = 0; i < p; ++i) {
        // compute single effect theta_i
        std::size_t base_index = (i * (2 * p + 1 - i)) / 2;
        theta[base_index] = log(pi[base_index] / (1.0 - pi[base_index]));
        // iterate all higher indexed interactions with the currecnt effect
        for (std::size_t j = i + 1; j < p; ++j) {
            theta[base_index + (j - i)] = log(
                ((1 - pi(i) * pi(j)) * pi(i, j)) / (pi(i) * pi(j) * (1 - pi(i, j)))
            );
        }
    }

    return theta;
}

//' Computes the log-likelihood at natural parameters `theta` of the Ising model
//' given a set of observations `Y`
//'
//'      l(theta) = log(p_0(theta)) + n^-1 sum_i vech(y_i y_i')' theta
//'
// [[Rcpp::export(rng = false)]]
double ising_log_likelihood(const VechView& theta, const MVBinary& Y) {
    // sum the log odds `sum_i vech(y_i y_i')' theta`
    double sum = 0.0;
    for (const uint32_t y : Y) {
        sum += ising_log_odds_sum(y, theta);
    }

    // add scaling factor `log(p_0(theta))`
    return log(ising_prob0(theta)) + (sum / static_cast<double>(Y.size()));
}

//' Computes the Score of the Ising model, this is the gradiend of the (mean)
//' log-likelihood with respect to the natural parameters
//'
//'      grad l(theta) = -E[vech(Y Y')] + n^-1 sum_i vech(y_i y_i')
//'
// [[Rcpp::export(rng = false)]]
Rcpp::NumericVector ising_score(const VechView& theta, const MVBinary& Y) {
    const std::size_t p = theta.dim();

    // Step 1: compute -E[vech(Y Y')] (data independent part)
    auto score = ising_marginal_probs(theta);
    for (auto& s : score) {
        s *= -1.0;
    }

    // Step 2: Add data dependend part `mean_i vech(y_i y_i')`
    const double n_inv = 1.0 / static_cast<double>(Y.size());
    for (const uint32_t y : Y) {
        // start by iterating the single effects in `y` (LSB)
        for (uint32_t a = y; a; a &= a - 1) {
            int i = bitScanLS(a);
            int base_index = (i * (2 * p + 1 - i)) / 2;
            // add single effects
            score[base_index] += n_inv;
            // and the two way interaction effects
            for (uint32_t b = a & (a - 1); b; b &= b - 1) {
                score[base_index + bitScanLS(b) - i] += n_inv;
            }
        }
    }

    return score;
}

/**
 * Overload of `ising_score` for a single observation `y`
 */
Rcpp::NumericVector ising_score(const VechView& theta,
    const uint32_t y, const std::size_t p
) {
    // Step 1: compute -E[vech(Y Y')] (data independent part)
    auto score = ising_marginal_probs(theta);
    for (auto& s : score) {
        s *= -1.0;
    }

    // Step 2: Add data dependend part `vech(y y')`
    // start by iterating the single effects in `y` (LSB)
    for (uint32_t a = y; a; a &= a - 1) {
        int i = bitScanLS(a);
        int base_index = (i * (2 * p + 1 - i)) / 2;
        // add single effects
        score[base_index] += 1.0;
        // and the two way interaction effects
        for (uint32_t b = a & (a - 1); b; b &= b - 1) {
            score[base_index + bitScanLS(b) - i] += 1.0;
        }
    }

    return score;
}

/**
 * Computes the log-likelihood at natural parameters `theta` of the Ising model
 * given a set of observations `Y`
 *
 *      l(alpha) = n^-1 sum_i (log(p_0(alpha, x_i)) + (x_i' alpha y_i)^2)
 *               = n^-1 sum_i (log(p_0(theta_alpha(x_i))) + vech(y_i y_i')'theta_alpha(x_i))
 */
// [[Rcpp::export(rng = false)]]
double ising_conditional_log_likelihood(const Rcpp::NumericMatrix& alpha,
    const Rcpp::NumericMatrix& X, const MVBinary& Y
) {
    // get probem dimensions
    const std::size_t p = alpha.nrow();
    const std::size_t q = alpha.ncol();

    // check parameter dimensions
    if (X.nrow() != Y.size() || X.ncol() != p || Y.dim() != q) {
        Rcpp::stop("Parameter dimension missmatch");
    }

    // natural parameter for the conditional Ising model
    //      `theta_alpha(x) = vech((2 1_q 1_q' - I_q) o Theta_alpha(x))`
    // with `o` denoting the Hadamart product.
    VechView theta(q);
    // temp inbetween variables
    Rcpp::NumericVector z(q);

    // sum over all observations
    double ll = 0.0;
    for (std::size_t i = 0; i < Y.size(); ++i) {
        // get i'th observation (response `y` and predictor `x`)
        const uint32_t y = Y[i];
        const auto x = X.row(i);

        // compute natural parameter from `alpha` with current predictor `x`
        // First `z = alpha' x_i` and `s = y_i' alpha' x_i`
        double s = 0.0;
        for (std::size_t j = 0; j < q; ++j) {
            z[j] = 0.0;
            for (std::size_t k = 0; k < p; ++k) {
                z[j] += x[k] * alpha[j * p + k];
            }
            s += static_cast<bool>(y & (1 << j)) * z[j];
        }
        // and then `vech` of the outer product `z z'` with off diagonal
        // elements multiplied by `2`. (See: `Theta` to `theta` relation)
        // Explicitly (`theta = theta_alpha(x)`):
        //      `theta = vech((2 1_q 1_q' - I_q) o (alpha' x x' alpha))`
        // where `o` is the Hadamard pdoruct.
        for (std::size_t j = 0; j < q; ++j) {
            theta(j, j) = z[j] * z[j];
            for (std::size_t k = j + 1; k < q; ++k) {
                theta(j, k) = 2.0 * z[j] * z[k];
            }
        }

        // add to log-lilelihood sum
        ll += log(ising_prob0(theta)) + s * s;
    }

    return ll / static_cast<double>(Y.size());
}

/**
 * Comutes the Score for the conditional Ising model
 *
 *      P(Y = y | X = x) = p_0(alpha) exp(y' Theta_alpha(x) y)
 *
 * with the parameter relation
 *
 *      Theta_alpha(x) = alpha' x x' alpha.
 *
 * The computed Score has the form
 *
 *      grad l(alpha) =
 *          2 n^-1 sum_i x_i x_i' alpha (-E[vec(Y Y') | X = x_i] + vec(y_i y_i'))
 */
// [[Rcpp::export(rng = false)]]
Rcpp::NumericVector ising_conditional_score(const Rcpp::NumericMatrix& alpha,
    const Rcpp::NumericMatrix& X, const MVBinary& Y
) {
    // get probem dimensions
    const std::size_t p = alpha.nrow();
    const std::size_t q = alpha.ncol();

    // check parameter dimensions
    if (X.nrow() != Y.size() || X.ncol() != p || Y.dim() != q) {
        Rcpp::stop("Parameter dimension missmatch");
    }

    // setup the Score (default zero initialized)
    Rcpp::NumericMatrix score(p, q);
    // natural parameter for the conditional Ising model
    //      `theta_alpha(x) = vech((2 1_q 1_q' - I_q) o Theta_alpha(x))`
    // with `o` denoting the Hadamart product.
    VechView theta(q);
    // temp inbetween variables
    Rcpp::NumericVector z(q);

    // for each observation (iter observation index i = 0, ..., n - 1)
    for (std::size_t i = 0; i < Y.size(); ++i) {
        // get i'th observation (response `y` and predictor `x`)
        const uint32_t y = Y[i];
        const auto x = X.row(i);

        // compute natural parameter from `alpha` with current predictor `x`
        // First `z = alpha' x`
        for (std::size_t j = 0; j < q; ++j) {
            z[j] = 0.0;
            for (std::size_t k = 0; k < p; ++k) {
                z[j] += x[k] * alpha[j * p + k];
            }
        }
        // and then `vech` of the outer product `z z'` with off diagonal
        // elements multiplied by `2`. (See: `Theta` to `theta` relation)
        // Explicitly (`theta = theta_alpha(x)`):
        //      `theta = vech((2 1_q 1_q' - I_q) o (alpha' x x' alpha))`
        // where `o` is the Hadamard pdoruct.
        for (std::size_t j = 0; j < q; ++j) {
            theta(j, j) = z[j] * z[j];
            for (std::size_t k = j + 1; k < q; ++k) {
                theta(j, k) = 2.0 * z[j] * z[k];
            }
        }

        // With `theta_alpha(x)` compute the classic Ising model Score at
        // `theta = theta_alpha(x)` for a single observation `y`.
        //      `S = grad_theta l`
        const auto S = ising_score(theta, y, q);

        // convert classic ising model Score to conditional Ising model Score
        // by using:
        //      `grad_alpha l = 2 x x' alpha vec^-1(D_q grad_theta l)`
        // where `D_q` is the duplication matrix.
        // For each column of `grad_alpha l`
        for (std::size_t k = 0; k < q; ++k) {
            // compute the `k`th vector matrix product `(z' S)_k` component
            double zS_k = 0.0;
            for (std::size_t l = 0; l < q; ++l) {
                zS_k += z[l] * S[theta.index(std::min(k, l), std::max(k, l))];
            }
            // and now add the `k`th column `x (z' S)_k` to the Score
            for (std::size_t j = 0; j < p; ++j) {
                score[k * p + j] += x[j] * zS_k;
            }
        }
    }

    return (2.0 / static_cast<double>(X.nrow())) * score;
}

// TODO: Drop or rewrite to test/fix issue with R/Rcpp multithreading issue,
//       Apparently I can NOT use Rcpp in conjunction with multiple threads
//       as the garbage collector stack gets out of sync.
// SEE: https://stackoverflow.com/questions/42119609/how-to-handle-mismatching-in-calling-convention-in-r-and-c-using-rcpp
//
// NOTE: The below algorithm implements the _old_ `theta` to `eta` relation
//
// // [[Rcpp::export(rng = false)]]
// Rcpp::NumericVector ising_conditional_score_mt(const Rcpp::NumericMatrix& alpha,
//     const Rcpp::NumericMatrix& X, const MVBinary& Y
// ) {
//     // get probem dimensions
//     const std::size_t p = alpha.nrow();
//     const std::size_t q = alpha.ncol();

//     // check parameter dimensions
//     if (X.nrow() != Y.size() || X.ncol() != p || Y.dim() != q) {
//         Rcpp::stop("Parameter dimension missmatch");
//     }

//     // setup the Score (default zero initialized)
//     Rcpp::NumericMatrix score(p, q);

//     // setup a thread pool to perform per observation computations in parallel
//     ThreadPool pool;

//     // for each observation setup a task to compute the Score this observation
//     // Score and then add them up
//     for (std::size_t i = 0; i < Y.size(); ++i) {
//         // push task to the thread pool, they are executed as soon as possible
//         pool.push([&alpha, &X, &Y, i, p, q, /*out*/ &score]() {

//             // get i'th observation (response `y` and predictor `x`)
//             const uint32_t y = Y[i];
//             const auto x = X.row(i);

//             // natural parameter for the conditional Ising model
//             //      `theta_alpha(x) = vech((2 1_q 1_q' - I_q) o Theta_alpha(x))`
//             // with `o` denoting the Hadamart product.
//             VechView theta(q);
//             // temp inbetween variables
//             Rcpp::NumericVector z(q);

//             // compute natural parameter from `alpha` with current predictor `x`
//             // First `z = alpha' x`
//             for (std::size_t j = 0; j < q; ++j) {
//                 z[j] = 0.0;
//                 for (std::size_t k = 0; k < p; ++k) {
//                     z[j] += x[k] * alpha[j * p + k];
//                 }
//             }
//             // and then `vech` of the outer product `z z'` with off diagonal
//             // elements multiplied by `2`. (See: `Theta` to `theta` relation)
//             // Explicitly (`theta = theta_alpha(x)`):
//             //      `theta = vech((2 1_q 1_q' - I_q) o (alpha' x x' alpha))`
//             // where `o` is the Hadamard pdoruct.
//             for (std::size_t j = 0; j < q; ++j) {
//                 theta(j, j) = z[j] * z[j];
//                 for (std::size_t k = j + 1; k < q; ++k) {
//                     theta(j, k) = 2.0 * z[j] * z[k];
//                 }
//             }

//             // With `theta_alpha(x)` compute the classic Ising model Score at
//             // `theta = theta_alpha(x)` for a single observation `y`.
//             //      `S = grad_theta l`
//             const auto S = ising_score(theta, y, q);

//             // convert classic ising model Score to conditional Ising model Score
//             // by using:
//             //      `grad_alpha l = 2 x x' alpha vec^-1(D_q grad_theta l)`
//             // where `D_q` is the duplication matrix.
//             // For each column of `grad_alpha l`
//             for (std::size_t k = 0; k < q; ++k) {
//                 // compute the `k`th vector matrix product `(z' S)_k` component
//                 double zS_k = 0.0;
//                 for (std::size_t l = 0; l < q; ++l) {
//                     zS_k += z[l] * S[theta.index(std::min(k, l), std::max(k, l))];
//                 }
//                 // and now add the `k`th column `x (z' S)_k` to the Score
//                 // Add to output score, `+=` should be atomic (TODO: check this!!!)
//                 for (std::size_t j = 0; j < p; ++j) {
//                     score[k * p + j] += x[j] * zS_k;
//                 }
//             }
//         });
//     }

//     return (2.0 / static_cast<double>(X.nrow())) * score;
// }


//' Fisher information for the natural parameters `theta` under the Ising model.
//'
//'      I(theta) = -E(H l(theta) | theta) = Cov(vech(Y Y'), vech(Y Y') | theta)
//'
//' where `H l(theta)` is the Hessian of the log-likelihood `l(theta)` defined as
//'
//'      l(theta) = n^-1 prod_i P(Y = y_i | theta)
//'               = log(p_0(theta)) - mean_i exp(vech(y_i y_i')' theta)
//'
//' Note that the Fisher information does _not_ depend on data.
//'
// [[Rcpp::export(rng = false)]]
Rcpp::NumericMatrix ising_fisher_info(const VechView& theta) {

    // Step 0: Setup (and validate) variables/parameters
    const std::size_t p = theta.dim();
    const std::size_t pp = p * (p + 1) / 2;
    const uint32_t max_event = static_cast<uint32_t>(-1) >> (32 - p);
    double sum_0 = 1.0;  // accumulates p_0(theta)^-1
    // Initialize result covariance matrix (zero initialized)
    Rcpp::NumericMatrix cov(pp, pp);

    // Compute (unscaled) lower triag part of `E(vech(Y Y') vech(Y Y')' | theta)`
    for (uint32_t y = 1; y <= max_event; ++y) {
        // Compute (unscaled) probability `P(Y = y | theta)`
        const double prob = exp(ising_log_odds_sum(y, theta));
        // and add to scaling factor accumulator
        sum_0 += prob;

        // Iterate Fisher information column indices (set bits in vech(y y'))
        // Those are the LSB lower triag indices of the `a, b` LSBs
        for (uint32_t a = y; a; a &= a - 1) {
            const std::size_t i = bitScanLS(a);
            const std::size_t ti = (i * (2 * p + 1 - i)) / 2;

            for (uint32_t b = a; b; b &= b - 1) {
                // Fisher info column index
                const std::size_t col = ti + (bitScanLS(b) - i);

                // Fill (lower triangular) Fisher information row positions
                // by computing the remaining (higher indexed) of vech(y y')
                // via the LSB bits of `c, d`
                // TODO: Slight inaccuracy with doing a bit to much work
                for (uint32_t c = a; c; c &= c - 1) {
                    const std::size_t j = bitScanLS(c);
                    const std::size_t tj = (j * (2 * p + 1 - j)) / 2;

                    for (uint32_t d = c; d; d &= d - 1) {
                        // Fisher info row index (lower triag part)
                        const std::size_t row = tj + (bitScanLS(d) - j);

                        // add (unscaled) probability for current event effects
                        cov[row + pp * col] += prob;
                    }
                }
            }
        }
    }

    // finish scaling factor
    const double p_0 = 1.0 / sum_0;

    // Scale and mirrow lower triag part of `E(vech(Y Y') vech(Y Y')' | theta)`
    for (std::size_t col = 0; col < pp; col++) {
        for (std::size_t row = col; row < pp; ++row) {
            cov[col + pp * row] = (cov[row + pp * col] *= p_0);
        }
    }

    // Subtract outer product of the expectation
    // `cov -= E(vech(Y Y') | theta) E(vech(Y Y') | theta)'`
    auto score = ising_marginal_probs(theta);
    std::transform(cov.begin(), cov.end(),
        Rcpp::outer(score, score, std::multiplies<double>()).begin(),
        cov.begin(), std::minus<double>());

    return cov;
}

//' Samples from the Ising model given the natural parameters `theta`
//'
// [[Rcpp::export(rng = true)]]
MVBinary ising_sample(const std::size_t n, const VechView& theta,
    const std::size_t warmup = 1000
) {
    const std::size_t p = theta.dim();
    const std::size_t max_event = static_cast<uint32_t>(-1) >> (32 - p);

    // for small dimensions we use Rcpp::sample
    if (p < 5) {
        // setup complete probability vector for every possible event
        const auto probs = ising_probs(theta);

        // sample from all possible events given probabilities for each
        const auto events = Rcpp::sample(1 << p, n, true, probs, false);

        return MVBinary(events.begin(), events.end(), p);
    }
    // Else: "non trivial implementation" case: Gibbs sampling

    // RNG for continuous uniform `U[0, 1]`
    auto runif = Rcpp::stats::UnifGenerator(0, 1);

    // conditional probability `P(Y_i = 1 | Y_-i = y_-i) = exp_i / (1 + exp_i)`
    // where `exp_i = exp(theta_i + sum_{j != i} y_j theta_{i,j})`
    auto icond_prob = [&theta](const std::size_t i, const uint32_t y) {
        double log_odds = theta(i);
        for (uint32_t a = y & ~(static_cast<uint32_t>(1) << i); a; a &= a - 1) {
            const std::size_t j = bitScanLS(a);
            log_odds += theta(std::min(i, j), std::max(i, j));
        }
        return 1.0 / (1.0 + exp(-log_odds));
    };

    // Reserve result data set
    MVBinary events(n, p);

    // repeat untill `n` samples are drawn
    while (events.size() < n) {
        // Initialize sample as vector of independent bernoulli draws with
        // component wise probability `P(Y_i = 1 | Y_-i = 0)`
        uint32_t y = 0;
        for (std::size_t i = 0; i < p; ++i) {
            y |= static_cast<uint32_t>((runif() * (1.0 + exp(-theta(i)))) < 1.0) << i;
        }

        // repeated cyclic updating
        for (std::size_t rep = 0; rep < warmup; ++rep) {
            for (std::size_t i = 0; i < p; ++i) {
                // event with the `i`th component `1` and all others `0`
                const uint32_t ei = static_cast<uint32_t>(1) << i;
                // sample `i`th bit from `Bernoulli(P(Y_i = 1 | Y_-i = y_-i))`
                y = (y & ~ei) | (ei * (runif() < icond_prob(i, y)));
            }
        }

        // add generated sample to result set
        events.push_back(y);
    }

    return events;
}