/** Methods computing or estimating the second moment of the Ising model given
 * the natural parameters of the p.m.f. in exponential family form. That is
 *      f(x) = p0(params) exp(vech(x x')' params)
 * where `params` are the natural parameters. The scaling constant `p0(params)`
 * is also the zero event `x = (0, ..., 0)` probability.
 *
 * Three different version are provided.
 * - exact: Exact method computing the true second moment `E[X X']` given the
 *      parameters `params`. This method has a runtime of `O(2^dim)` where
 *      `dim` is the dimension of the random variable `X`. This means it is
 *      infeasible for bigger (like 25 and above) values if `dim`
 * - MC: Monte-Carlo method used in case the exact method is not feasible.
 * - MC Thrd: Multi-Threaded Monte-Carlo method.
 *
 * The natural parameters `params` of the Ising model is a `dim (dim + 1) / 2`
 * dimensional numeric vector containing the log-odds. The indexing schema into
 * the parameters corresponding to the log-odds of single or two way interactions
 * with indicex `i, j` are according to the half-vectorization schema
 *
 *  i = 4'th row                  => Symmetry                       => Half-Vectorized Storage
 *
 * [ *    *    *    *    *    * ]    [ *    *    *    *    *    * ]   [ *    *    *   15    *    * ]
 * [ *    *    *    *    *    * ]    [ *    *    *    *    *    * ]   [ *    *   12   16    * ]
 * [ *    *    *    *    *    * ]    [ *    *    *    *    *    * ]   [ *    8    *   17 ]
 * [ 3    8   12   15   16   17 ]    [ 3    8   12   15    *    * ]   [ 3    *    * ]
 * [ *    *    *    *    *    * ]    [ *    *    *   16    *    * ]   [ *    * ]
 * [ *    *    *    *    *    * ]    [ *    *    *   17    *    * ]   [ * ]
 *
 */
#include <float.h>  // DBL_MAX

#include "R_api.h"
#include "bit_utils.h"
#include "int_utils.h"
#include "ising_MCMC.h"

#ifndef __STDC_NO_THREADS__
    #include <threads.h>
#endif

////////////////////////////////////////////////////////////////////////////////
// TODO: read                                                                 //
// https://developer.r-project.org/Blog/public/2019/04/18/common-protect-errors/
// and
// https://developer.r-project.org/Blog/public/2018/12/10/unprotecting-by-value/index.html
// as well as do the following PROPERLLY
// TODO: make specialized version using the parameters in given sym mat form  //
//       similar to `ising_sample()`                                          //
////////////////////////////////////////////////////////////////////////////////


// void ising_m2_exact_mat(
//     const size_t dim, const size_t ldA, const double* A,
//     double* M2
// ) {
//     double sum_0 = 1.0;

//     const uint32_t max_event = (uint32_t)1 << dim;

//     (void)memset(M2, 0, dim * dim * sizeof(double));

//     for (uint32_t X = 1; X < max_event; ++X) {
//         // Evaluate quadratic form `X' A X` using symmetry of `A`
//         double XtAX = 0.0;
//         for (uint32_t Y = X; Y; Y &= Y - 1) {
//             const int i = bitScanLS32(Y);
//             XtAX += A[i * (ldA + 1)];
//             for (uint32_t Z = Y & (Y - 1); Z; Z &= Z - 1) {
//                 XtAX += 2 * A[i * ldA + bitScanLS32(Z)];
//             }
//         }

//         const double prob_X = exp(XtAX);
//         sum_0 += prob_X;

//         for (uint32_t Y = X; Y; Y &= Y - 1) {
//             const int i = bitScanLS32(Y);
//             for (uint32_t Z = Y; Z; Z &= Z - 1) {
//                 M2[i * dim + bitScanLS32(Z)] += prob_X;
//             }
//         }
//     }

//     const double prob_0 = 1.0 / sum_0;
//     for (size_t j = 0; j < dim; ++j) {
//         for (size_t i = 0; i < j; ++i) {
//             M2[j * dim + i] = M2[i * dim + j];
//         }
//         for (size_t i = j; i < dim; ++i) {
//             M2[i * dim + j] *= prob_0;
//         }
//     }
// }

double ising_m2_exact(const size_t dim, const double* params, double* M2) {
    // Accumulator of sum of all (unscaled) probabilities
    double sum_0 = 1.0;

    // max event (actually upper bound)
    const uint32_t max_event = (uint32_t)1 << dim;

    // Length of parameters
    const size_t len = dim * (dim + 1) / 2;

    // Initialize M2 to zero
    (void)memset(M2, 0, len * sizeof(double));

    // Iterate all `2^dim` binary vectors `X`
    for (uint32_t X = 1; X < max_event; ++X) {
        // Dot product `<vech(X X'), params>`
        double dot_X = 0.0;
        // all bits in `X`
        for (uint32_t Y = X; Y; Y &= Y - 1) {
            const int i = bitScanLS32(Y);
            const int I = (i * (2 * dim - 1 - i)) / 2;
            // add single and two way interaction log odds
            for (uint32_t Z = Y; Z; Z &= Z - 1) {
                dot_X += params[I + bitScanLS32(Z)];
            }
        }

        // (unscaled) probability of current event `X`
        const double prob_X = exp(dot_X);
        sum_0 += prob_X;

        // Accumulate set bits probability for the first and second moment `E[X X']`
        for (uint32_t Y = X; Y; Y &= Y - 1) {
            const int i = bitScanLS32(Y);
            const int I = (i * (2 * dim - 1 - i)) / 2;
            for (uint32_t Z = Y; Z; Z &= Z - 1) {
                M2[I + bitScanLS32(Z)] += prob_X;
            }
        }
    }

    // Finish by scaling with zero event probability (Ising p.m.f. scaling constant)
    const double prob_0 = 1.0 / sum_0;
    for (size_t i = 0; i < len; ++i) {
        M2[i] *= prob_0;
    }

    return log(prob_0);
}


// Monte-Carlo method using Gibbs sampling for the second moment
double ising_m2_MC(
    /*   options */ const size_t nr_samples, const size_t warmup,
    /* dimension */ const size_t dim,
    /*   vectors */ const double* params, double* M2
) {

    // Length of `params` and `M2`
    const size_t len = (dim * (dim + 1)) / 2;

    // Dirty trick (reuse output memory for precise counting)
    _Static_assert(sizeof(uint64_t) == sizeof(double), "Dirty trick fails!");
    uint64_t* counts = (uint64_t*)M2;

    // Initialize counts to zero
    (void)memset(counts, 0, len * sizeof(uint64_t));

    // Allocate memory to store Markov-Chain value
    int* X = (int*)R_alloc(dim, sizeof(int));

    // Accumulator for Monte-Carlo estimate for zero probability `P(X = 0)`
    double accum = 0.0, max_mdot_X = -DBL_MAX;

    // Create/Update R's internal PRGN state
    GetRNGstate();

    // Spawn chains
    for (size_t sample = 0; sample < nr_samples; ++sample) {
        // Draw random sample `X ~ Ising(params)`
        ising_mcmc_vech(warmup, dim, params, X);

        // Accumulate component counts and compute (unscaled) probability of `X`
        uint64_t* count = counts;
        double dot_X = 0.0;
        for (size_t i = 0; i < dim; ++i) {
            const size_t I = (i * (2 * dim - 1 - i)) / 2;
            for (size_t j = i; j < dim; ++j) {
                *(count++) += X[i] & X[j];
                dot_X += (X[i] & X[j]) ? params[I + j] : 0.0;
            }
        }
        if (-dot_X > max_mdot_X) {
            accum *= exp(max_mdot_X + dot_X);
            max_mdot_X = -dot_X;
        }
        accum += exp(-dot_X - max_mdot_X);
    }

    // Write R's internal PRNG state back (if needed)
    PutRNGstate();

    // Compute means from counts
    for (size_t i = 0; i < len; ++i) {
        M2[i] = (double)counts[i] / (double)nr_samples;
    }

    // Prob. of zero event (Ising p.m.f. scaling constant)
    return log(accum) + max_mdot_X - log(2) * (double)dim - log((double)nr_samples);
}


// in case the compile supports standard C threads
#ifndef __STDC_NO_THREADS__

// Worker thread to calling thread communication struct
typedef struct thrd_data {
    rng_seed_t seed;        // Pseudo-Random-Number-Generator seed/state value
    size_t nr_samples;      // Nr. of samples this worker handles
    size_t warmup;          // Monte-Carlo Chain burne-in length
    size_t dim;             // Random variable dimension
    const double* params;   // Ising model parameters
    int* X;                 // Working memory to store current binary sample
    uint32_t* counts;       // (output) count of single and two way interactions
    double log_sum_exp;     // (output) Monte-Carlo inbetween value for `log(P(X = 0))`
} thrd_data_t;

// Worker thread function
int thrd_worker(thrd_data_t* data) {
    // Extract data as thread local variables (for convenience)
    const size_t dim = data->dim;
    int* X = data->X;

    // Initialize counts to zero
    (void)memset(data->counts, 0, (dim * (dim + 1) / 2) * sizeof(uint32_t));

    // Accumulator for Monte-Carlo estimate for zero probability `P(X = 0)`
    double accum = 0.0, max_mdot_X = -DBL_MAX;

    // Spawn Monte-Carlo Chains (one foe every sample)
    for (size_t sample = 0; sample < data->nr_samples; ++sample) {
        // Draw random sample `X ~ Ising(params)`
        ising_mcmc_vech_thrd(data->warmup, dim, data->params, X, data->seed);

        // Accumulate component counts and compute (unscaled) probability of `X`
        uint32_t* count = data->counts;
        double dot_X = 0.0;
        for (size_t i = 0; i < dim; ++i) {
            const size_t I = (i * (2 * dim - 1 - i)) / 2;
            for (size_t j = i; j < dim; ++j) {
                *(count++) += X[i] & X[j];
                dot_X += (X[i] & X[j]) ? data->params[I + j] : 0.0;
            }
        }
         if (-dot_X > max_mdot_X) {
            accum *= exp(max_mdot_X + dot_X);
            max_mdot_X = -dot_X;
        }
        accum += exp(-dot_X - max_mdot_X);
   }

    data->log_sum_exp = log(accum) + max_mdot_X;

    return 0;
}

double ising_m2_MC_thrd(
    /*   options */ const size_t nr_samples, const size_t warmup,
                    const size_t nr_threads,
    /* dimension */ const size_t dim,
    /*   vectors */ const double* params, double* M2
) {
    // Length of `params` and `M2`
    const size_t len = (dim * (dim + 1)) / 2;

    // Allocate working and self memory for worker threads
    int* Xs = (int*)R_alloc(dim * nr_threads, sizeof(int));
    uint32_t* counts = (uint32_t*)R_alloc(len * nr_threads, sizeof(uint32_t));
    thrd_t* threads = (thrd_t*)R_alloc(nr_threads, sizeof(thrd_data_t));
    thrd_data_t* threads_data = (thrd_data_t*)R_alloc(nr_threads, sizeof(thrd_data_t));

    // Provide instruction wor worker and dispatch them
    for (size_t tid = 0; tid < nr_threads; ++tid) {
        // Every thread needs its own PRNG seed!
        init_seed(threads_data[tid].seed);
        // divide work among workers (more or less) equaly with the first worker
        // (tid = 0) having the additional remainder of divided work (`nr_samples`)
        threads_data[tid].nr_samples = nr_samples / nr_threads + (
            tid ? 0 : nr_samples % nr_threads
        );
        threads_data[tid].warmup = warmup;
        threads_data[tid].dim = dim;
        threads_data[tid].params = params;
        // Every worker gets disjoint working memory
        threads_data[tid].X = &Xs[dim * tid];
        threads_data[tid].counts = &counts[len * tid];
        // dispatch the worker
        thrd_create(&threads[tid], (thrd_start_t)thrd_worker, (void*)&threads_data[tid]);
    }

    // Join (Wait for) all worker
    for (size_t tid = 0; tid < nr_threads; ++tid) {
        thrd_join(threads[tid], NULL);
    }

    // Accumulate worker results into first (tid = 0) worker counts result
    // while determining the log sum exp maximum
    double max = threads_data[0].log_sum_exp;
    for (size_t tid = 1; tid < nr_threads; ++tid) {
        for (size_t i = 0; i < len; ++i) {
            counts[i] += counts[tid * len + i];
        }
        max = (max < threads_data[tid].log_sum_exp) ? threads_data[tid].log_sum_exp : max;
    }

    // Accum all `log(P(X = 0))` via "LogSumExp" trick into final result
    double accum = 0;
    for (size_t tid = 0; tid < nr_threads; ++tid) {
        accum += exp(threads_data[tid].log_sum_exp - max);
    }

    // convert discreat counts into means
    for (size_t i = 0; i < len; ++i) {
        M2[i] = (double)counts[i] / (double)nr_samples;
    }

    // Log of Prob. of zero event (Ising p.m.f. scaling constant)
    return log(accum) + max - log(2) * (double)dim - log((double)nr_samples);
}

#endif /* !__STDC_NO_THREADS__ */


/* Implementation of `.Call` entry point */
////////////////////////////////////////////////////////////////////////////////
// TODO: make specialized version using the parameters in given sym mat form  //
//       similar to `ising_sample()`                                          //
////////////////////////////////////////////////////////////////////////////////
extern SEXP R_ising_m2(
    SEXP _params, SEXP _use_MC, SEXP _nr_samples, SEXP _warmup, SEXP _nr_threads
) {
    // Extract and validate arguments
    size_t protect_count = 0;
    if (!Rf_isReal(_params)) {
        _params = PROTECT(Rf_coerceVector(_params, REALSXP));
        protect_count++;
    }

    // Depending on parameters given in symmetric matrix form or natural
    // or natrual parameters in the half-vectorized memory layout
    size_t dim;
    double* params;
    if (Rf_isMatrix(_params)) {
        if (Rf_nrows(_params) != Rf_ncols(_params)) {
            Rf_error("Invalid 'params' value, exected square matrix");
        }
        // Convert to natural parameters
        dim = Rf_nrows(_params);
        params = (double*)R_alloc(dim * (dim + 1) / 2, sizeof(double));

        double* A = REAL(_params);
        for (size_t j = 0, I = 0; j < dim; ++j) {
            for (size_t i = j; i < dim; ++i, ++I) {
                params[I] = (1 + (i != j)) * A[i + j * dim];
            }
        }
    } else {
        // Get Ising random variable dimension `dim` from "half-vectorized" parameters
        // vector. That is, compute `dim` from `length(params) = dim (dim + 1) / 2`.
        dim = invTriag(Rf_length(_params));
        if (!dim) {
            Rf_error("Expected parameter vector of length `p (p + 1) / 2` where"
                " `p` is the dimension of the random variable");
        }
        params = REAL(_params);
    }

    // Determin the method to use and validate if possible
    const int use_MC = Rf_asLogical(_use_MC);
    if (use_MC == NA_LOGICAL) {
        Rf_error("Invalid 'use_MC' value, expected ether TRUE or FALSE");
    }

    // Allocate result vector
    SEXP _M2 = PROTECT(Rf_allocVector(REALSXP, dim * (dim + 1) / 2));
    ++protect_count;

    // asside computed log of zero event probability (log of recibrocal partition
    // function), the log scaling factor for the Ising model p.m.f.
    double log_prob_0 = -1.0;

    if (use_MC) {
        // Convert and validate arguments for the Monte-Carlo methods
        const size_t nr_samples = asUnsigned(_nr_samples);
        if (nr_samples == 0 || nr_samples == NA_UNSIGNED) {
            Rf_error("Invalid 'nr_samples' value, expected pos. integer");
        }
        const size_t warmup = asUnsigned(_warmup);
        if (warmup == NA_UNSIGNED) {
            Rf_error("Invalid 'warmup' value, expected non-negative integer");
        }
        const size_t nr_threads = asUnsigned(_nr_threads);
        if (nr_threads == 0 || nr_threads > 256) {
            Rf_error("Invalid 'nr_thread' value");
        }

        if (nr_threads == 1) {
            // Single threaded Monte-Carlo method
            log_prob_0 = ising_m2_MC(nr_samples, warmup, dim, params, REAL(_M2));
        } else {
            // Multi-Threaded Monte-Carlo method if provided, otherwise use
            // the single threaded version with a warning
#ifdef __STDC_NO_THREADS__
                Rf_warning("Multi-Threading NOT supported, using fallback.");
                log_prob_0 = ising_m2_MC(nr_samples, warmup, dim, params, REAL(_M2));
#else
                log_prob_0 = ising_m2_MC_thrd(
                    nr_samples, warmup, nr_threads,
                    dim, params, REAL(_M2)
                );
#endif
        }
    } else {
        // Exact method (ignore other arguments), only validate dimension
        if (25 < dim) {
            Rf_error("Dimension '%ld' too big for exact method (max 24)", dim);
        }

        // and call the exact method
        log_prob_0 = ising_m2_exact(dim, params, REAL(_M2));
    }

    // Set log-lokelihood as an attribute to the computed second moment
    SEXP _log_prob_0 = PROTECT(Rf_ScalarReal(log_prob_0));
    ++protect_count;
    Rf_setAttrib(_M2, Rf_install("log_prob_0"), _log_prob_0);

    // release SEPXs to the garbage collector
    UNPROTECT(protect_count);

    return _M2;
}