#include <stdlib.h>
#include <math.h>

#include "sums.h"
#include "sweep.h"
#include "matrix.h"
#include "indexing.h"

// Gaussian kernel: `scale` carries the (negative) bandwidth factor,
// i.e. this evaluates exp(scale * x^2).
static inline double gaussKernel(const double x, const double scale) {
    return exp(scale * x * x);
}

// TODO: much potential for optimization!
static void weightedYandLoss(const int n,
                             const double *Y,
                             const double *vecD,
                             const double *vecW,
                             const double *colSums,
                             double *y1, double *L, double *vecS,
                             double *loss) {
    int i, j, k, N = n * (n - 1) / 2;
    double l;

    for (i = 0; i < n; ++i) {
        y1[i] = Y[i] / colSums[i];
        L[i]  = Y[i] * Y[i] / colSums[i];
    }

    for (k = j = 0; j < n; ++j) {
        for (i = j + 1; i < n; ++i, ++k) {
            y1[i] += Y[j] * vecW[k] / colSums[i];
            y1[j] += Y[i] * vecW[k] / colSums[j];
            L[i]  += Y[j] * Y[j] * vecW[k] / colSums[i];
            L[j]  += Y[i] * Y[i] * vecW[k] / colSums[j];
        }
    }

    l = 0.0;
    for (i = 0; i < n; ++i) {
        l += (L[i] -= y1[i] * y1[i]);
    }
    *loss = l / (double)n;

    for (k = j = 0; j < n; ++j) {
        for (i = j + 1; i < n; ++i, ++k) {
            l = Y[j] - y1[i];
            vecS[k]  = (L[i] - (l * l)) / colSums[i];
            l = Y[i] - y1[j];
            vecS[k] += (L[j] - (l * l)) / colSums[j];
        }
    }

    for (k = 0; k < N; ++k) {
        vecS[k] *= vecW[k] * vecD[k];
    }
}

void grad(const int n, const int p, const int q,
          const double *X,
          const double *X_diff,
          const double *Y,
          const double *V,
          const double h,
          double *G, double *loss) {
    // Number of non-trivial pairs (X_i, X_j), each counted once.
    int i, N = (n * (n - 1)) / 2;
    double scale = -0.5 / h;

    if (X_diff == NULL) {
        // TODO: compute `X_diff` from `X` when not supplied by the caller.
    }

    // Allocate and compute the projection matrix `Q = I_p - V * V^T`.
    double *Q = (double*)malloc(p * p * sizeof(double));
    nullProj(V, p, q, Q);

    // Allocate and compute the vectorized distance matrix from a temporary
    // projection of `X_diff`.
    double *vecD = (double*)malloc(N * sizeof(double));
    double *X_proj;
    if (p < 5) {
        // `X_proj` is reused below for `vecW` (N), `colSums` (n), `y1` (n)
        // and `L` (n); for small `p` a buffer of `N * p` may be too small,
        // so over-allocate. TODO: refine this bound.
        X_proj = (double*)malloc(N * 5 * sizeof(double));
    } else {
        X_proj = (double*)malloc(N * p * sizeof(double));
    }
    matrixprod(X_diff, N, p, Q, p, p, X_proj);
    rowSquareSums(X_proj, N, p, vecD);

    // Apply the kernel to the distance vector to obtain the weights.
    double *vecW = X_proj; // reuse memory area, no longer needed.
    for (i = 0; i < N; ++i) {
        vecW[i] = gaussKernel(vecD[i], scale);
    }
    double *colSums = X_proj + N; // still allocated!
    rowSumsSymVec(vecW, n, 1.0, colSums); // rowSums == colSums by symmetry.

    // Compute the weighted responses of the first and second moments,
    // aka `y1` and `L`.
    double *y1 = X_proj + N + n;
    double *L  = X_proj + N + (2 * n);
    // Allocate the `X_diff` scaling vector `vecS`; not placed in the
    // `X_proj` memory area because it is used simultaneously with `X_proj`
    // in the final gradient computation.
    double *vecS = (double*)malloc(N * sizeof(double));
    weightedYandLoss(n, Y, vecD, vecW, colSums, y1, L, vecS, loss);

    // Compute the gradient, using `X_proj` for the intermediate scaled `X_diff`.
    rowSweep(X_diff, N, p, "*", vecS, X_proj);
    // Reuse `Q`, which has the required dimension (p, p).
    crossprod(X_diff, N, p, X_proj, N, p, Q);
    // Product with `V`.
    matrixprod(Q, p, p, V, p, q, G);
    // Final scaling (TODO: move into matrixprod!).
    scale = -2.0 / (((double)n) * h * h);
    N = p * q;
    for (i = 0; i < N; ++i) {
        G[i] *= scale;
    }

    free(vecS);
    free(X_proj);
    free(vecD);
    free(Q);
}
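
/*
 * Usage sketch (not part of the library build; enable with -DGRAD_DEMO).
 * This is only an illustration of how `grad` might be called: the
 * column-major (R-style) storage of `X`, `V` and `X_diff`, the pair
 * ordering of `X_diff` (row k holds X_i - X_j for j < i, matching the
 * loop order above), and the bandwidth value are assumptions, not
 * something this file specifies.
 */
#ifdef GRAD_DEMO
#include <stdio.h>

int main(void) {
    const int n = 4, p = 2, q = 1;
    const int N = n * (n - 1) / 2; /* number of pairs with i > j */
    /* n x p data matrix, column-major (assumed layout). */
    double X[] = { 0.0, 1.0, 2.0, 3.0,   /* column 1 */
                   1.0, 0.5, 0.0, 2.0 }; /* column 2 */
    double Y[] = { 0.1, 0.9, 2.1, 2.9 }; /* response vector, length n */
    double V[] = { 0.6, 0.8 };           /* p x q with orthonormal column */
    double X_diff[N * p];                /* pairwise row differences */
    double G[p * q];
    double loss;
    int i, j, c, k;

    /* Build X_diff in the same pair order as the loops in grad(): for
     * each j, all i > j; entry (k, c) lives at X_diff[c * N + k]. */
    k = 0;
    for (j = 0; j < n; ++j) {
        for (i = j + 1; i < n; ++i, ++k) {
            for (c = 0; c < p; ++c) {
                X_diff[c * N + k] = X[c * n + i] - X[c * n + j];
            }
        }
    }

    grad(n, p, q, X, X_diff, Y, V, 0.5 /* bandwidth h, arbitrary */,
         G, &loss);

    printf("loss = %g\n", loss);
    for (i = 0; i < p * q; ++i) {
        printf("G[%d] = %g\n", i, G[i]);
    }
    return 0;
}
#endif /* GRAD_DEMO */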