// The need for `USE_FC_LEN_T` and `FCONE` is due to a Fortran character string
// to C incompatibility. See: Writing R Extentions: 6.6.1 Fortran character strings
#define USE_FC_LEN_T
#include <R.h>
#include <Rinternals.h>
#include <R_ext/BLAS.h>
#ifndef FCONE
#define FCONE
#endif

void mcrossprod(
    const int rank,
    const double* A, const int* dimA,
    const double* B, const int* dimB,
    const int mode,
    double* C
) {
    // the strides
    //  `stride[0] <- prod(dim(A)[seq_len(mode - 1)])`
    //  `stride[1] <- dim(A)[mode]`
    //  `stride[2] <- prod(dim(A)[-seq_len(mode)])`
    // Note: Middle stride is ignored (to be consistent with sym version)
    int stride[3] = {1, 0, 1};
    for (int i = 0; i < rank; ++i) {
        int size = dimA[i];
        stride[0] *= (i < mode) ? size : 1;
        stride[2] *= (i > mode) ? size : 1;
    }

    // employ BLAS dgemm (Double GEneralized Matrix Matrix) operation
    // (C = alpha op(A) op(A) + beta C, op is the identity of transposition)
    const double zero = 0.0;
    const double one  = 1.0;
    if (mode == 0) {
        // mode 1: special case C = A_(1) B_(1)^T
        // C = 1 A B^T + 0 C
        F77_CALL(dgemm)("N", "T", &dimA[mode], &dimB[mode], &stride[2],
            &one, A, &dimA[mode], B, &dimB[mode],
            &zero, C, &dimA[mode] FCONE FCONE);
    } else {
        // Other modes writen as accumulated sum of matrix products
        // initialize C to zero
        memset(C, 0, dimA[mode] * dimB[mode] * sizeof(double));

        // Sum over all modes > mode
        for (int i2 = 0; i2 < stride[2]; ++i2) {
            // C = 1 A^T B + 1 C
            F77_CALL(dgemm)("T", "N", &dimA[mode], &dimB[mode], &stride[0],
                &one, &A[i2 * stride[0] * dimA[mode]], &stride[0],
                &B[i2 * stride[0] * dimB[mode]], &stride[0],
                &one, C, &dimA[mode] FCONE FCONE);
        }
    }
}

/**
 * Tensor Mode Crossproduct
 *
 *      C = A_(m) t(B_(m))
 *
 * For a matrices `A` and `B`, the first mode `mcrossprod(A, B, 1)` is equivalent
 * to `A %*% t(B)` (`tcrossprod`). On the other hand for mode two
 * `mcrossprod(A, B, 2)` the equivalence is `t(A) %*% B` (`crossprod`).
 *
 * @param A multi-dimensional array
 * @param B multi-dimensional array
 * @param m mode index (1-indexed)
 */
extern SEXP R_mcrossprod(SEXP A, SEXP B, SEXP m) {
    // get zero indexed mode
    int mode = asInteger(m) - 1;

    // Check if both `A` and `B` are real-valued
    if (!isReal(A) || !isReal(B)) {
        error("Type missmatch, both `A` and `B` must be real-valued");
    }

    // get dimension attributes
    SEXP dimA_sexp = getAttrib(A, R_DimSymbol);
    SEXP dimB_sexp = getAttrib(B, R_DimSymbol);

    // dimension type validation
    if (!isInteger(dimA_sexp) || !isInteger(dimB_sexp)) {
        error("Dimensions must be integer vectors");
    }

    // validate dimensions
    int rank = length(dimA_sexp);
    if (rank != length(dimB_sexp)) {
        error("Dimension mismatch");
    }
    // validate mode (0-indexed, must be smaller than the tensor order)
    if (mode < 0 || rank <= mode) {
        error("Illegal mode");
    }

    // get raw pointers to dimensions
    int* dimA = INTEGER(coerceVector(dimA_sexp, INTSXP));
    int* dimB = INTEGER(coerceVector(dimB_sexp, INTSXP));

    // finaly, check for `A` and `B` dimensions to match
    for (int i = 0; i < rank; ++i) {
        if (i != mode && dimA[i] != dimB[i]) {
            error("Dimension mismatch");
        } 
    }

    // create response matrix C
    SEXP C = PROTECT(allocMatrix(REALSXP, dimA[mode], dimB[mode]));

    // Call C mode crossprod subroutine
    mcrossprod(
        rank,           // tensor rank of both `A` and `B`
        REAL(A), dimA,  // mem. addr. of A, dim(A)
        REAL(B), dimB,  // mem. addr. of B, dim(B)
        mode,           // the crossproduct mode to compute
        REAL(C)         // return value memory addr.
    );

    // release C to garbage collector
    UNPROTECT(1);

    return C;
}

/**
 * Symmetric Tensor Mode Crossproduct
 *
 *      C = A_(m) t(A_(m))
 *
 * For a matrix `A`, the first mode is `mcrossprod_sym(A, 1)` equivalent to
 * `A %*% t(A)` (`tcrossprod`). On the other hand for mode two
 * `mcrossprod(A, 2)` the equivalence is `t(A) %*% A` (`crossprod`).
 *
 * @param A multi-dimensional array
 * @param m mode index (1-indexed)
 */
extern SEXP R_mcrossprod_sym(SEXP A, SEXP m) {
    // get zero indexed mode
    int mode = asInteger(m) - 1;

    // get dimension attribute of A
    SEXP dim = getAttrib(A, R_DimSymbol);

    // validate mode (0-indexed, must be smaller than the tensor order)
    if (mode < 0 || length(dim) <= mode) {
        error("Illegal mode");
    }

    // the strides
    //  `stride[0] <- prod(dim(X)[seq_len(mode - 1)])`
    //  `stride[1] <- dim(X)[mode]`
    //  `stride[2] <- prod(dim(X)[-seq_len(mode)])`
    int stride[3] = {1, INTEGER(dim)[mode], 1};
    for (int i = 0; i < length(dim); ++i) {
        int size = INTEGER(dim)[i];
        stride[0] *= (i < mode) ? size : 1;
        stride[2] *= (i > mode) ? size : 1;
    }

    // create response matrix C
    SEXP C = PROTECT(allocMatrix(REALSXP, stride[1], stride[1]));

    // raw data access pointers
    double* a = REAL(A);
    double* c = REAL(C);

    // employ BLAS dsyrk (Double SYmmeric Rank K) operation
    // (C = alpha A A^T + beta C or C = alpha A^T A + beta C)
    const double zero = 0.0;
    const double one  = 1.0;
    if (mode == 0) {
        // mode 1: special case C = A_(1) A_(1)^T
        // C = 1 A A^T + 0 C
        F77_CALL(dsyrk)("U", "N", &stride[1], &stride[2],
            &one, a, &stride[1], &zero, c, &stride[1] FCONE FCONE);
    } else {
        // Other modes writen as accumulated sum of matrix products
        // initialize C to zero
        memset(c, 0, stride[1] * stride[1] * sizeof(double));

        // Sum over all modes > mode
        for (int i2 = 0; i2 < stride[2]; ++i2) {
            // C = 1 A^T A + 1 C
            F77_CALL(dsyrk)("U", "T", &stride[1], &stride[0],
                &one, &a[i2 * stride[0] * stride[1]], &stride[0],
                &one, c, &stride[1] FCONE FCONE);
        }
    }

    // Symmetric matrix result is stored in upper triangular part only
    // Copy upper triangular part to lower
    for (int j = 0; j + 1 < stride[1]; j++) {
        for (int i = j + 1; i < stride[1]; ++i) {
            c[i + j * stride[1]] = c[j + i * stride[1]];
        }
    }

    // release C to grabage collector
    UNPROTECT(1);

    return C;
}