2
0
Fork 0
CVE/wip.c

422 lines
13 KiB
C

#include <stdlib.h>
#include <R_ext/BLAS.h>
#include <R_ext/Lapack.h>
#include <R_ext/Error.h>
// #include <Rmath.h>
#include "wip.h"
static inline void rowSums(const double *A,
const int nrow, const int ncol,
double *sum) {
int i, j, block_size, block_size_i;
const double *A_block = A;
const double *A_end = A + nrow * ncol;
if (nrow > CVE_MEM_CHUNK_SIZE) {
block_size = CVE_MEM_CHUNK_SIZE;
} else {
block_size = nrow;
}
// Iterate `(block_size_i, ncol)` submatrix blocks.
for (i = 0; i < nrow; i += block_size_i) {
// Reset `A` to new block beginning.
A = A_block;
// Take block size of eveything left and reduce to max size.
block_size_i = nrow - i;
if (block_size_i > block_size) {
block_size_i = block_size;
}
// Compute first blocks column,
for (j = 0; j < block_size_i; ++j) {
sum[j] = A[j];
}
// and sum the following columns to the first one.
for (A += nrow; A < A_end; A += nrow) {
for (j = 0; j < block_size_i; ++j) {
sum[j] += A[j];
}
}
// Step one block forth.
A_block += block_size_i;
sum += block_size_i;
}
}
static inline void colSums(const double *A,
const int nrow, const int ncol,
double *sum) {
int j;
double *sum_end = sum + ncol;
memset(sum, 0, sizeof(double) * ncol);
for (; sum < sum_end; ++sum) {
for (j = 0; j < nrow; ++j) {
*sum += A[j];
}
A += nrow;
}
}
static inline void rowSquareSums(const double *A,
const int nrow, const int ncol,
double *sum) {
int i, j, block_size, block_size_i;
const double *A_block = A;
const double *A_end = A + nrow * ncol;
if (nrow < CVE_MEM_CHUNK_SIZE) {
block_size = nrow;
} else {
block_size = CVE_MEM_CHUNK_SIZE;
}
// Iterate `(block_size_i, ncol)` submatrix blocks.
for (i = 0; i < nrow; i += block_size_i) {
// Reset `A` to new block beginning.
A = A_block;
// Take block size of eveything left and reduce to max size.
block_size_i = nrow - i;
if (block_size_i > block_size) {
block_size_i = block_size;
}
// Compute first blocks column,
for (j = 0; j < block_size_i; ++j) {
sum[j] = A[j] * A[j];
}
// and sum the following columns to the first one.
for (A += nrow; A < A_end; A += nrow) {
for (j = 0; j < block_size_i; ++j) {
sum[j] += A[j] * A[j];
}
}
// Step one block forth.
A_block += block_size_i;
sum += block_size_i;
}
}
static inline void rowSumsSymVec(const double *Avec, const int nrow,
const double *diag,
double *sum) {
int i, j;
if (*diag == 0.0) {
memset(sum, 0, nrow * sizeof(double));
} else {
for (i = 0; i < nrow; ++i) {
sum[i] = *diag;
}
}
for (j = 0; j < nrow; ++j) {
for (i = j + 1; i < nrow; ++i, ++Avec) {
sum[j] += *Avec;
sum[i] += *Avec;
}
}
}
/* C[, j] = A[, j] * v for each j = 1 to ncol */
static void rowSweep(const double *A, const int nrow, const int ncol,
const char* op,
const double *v, // vector of length nrow
double *C) {
int i, j, block_size, block_size_i;
const double *A_block = A;
double *C_block = C;
const double *A_end = A + nrow * ncol;
if (nrow > CVE_MEM_CHUNK_SMALL) { // small because 3 vectors in cache
block_size = CVE_MEM_CHUNK_SMALL;
} else {
block_size = nrow;
}
if (*op == '+') {
// Iterate `(block_size_i, ncol)` submatrix blocks.
for (i = 0; i < nrow; i += block_size_i) {
// Set `A` and `C` to block beginning.
A = A_block;
C = C_block;
// Get current block's row size.
block_size_i = nrow - i;
if (block_size_i > block_size) {
block_size_i = block_size;
}
// Perform element wise operation for block.
for (; A < A_end; A += nrow, C += nrow) {
for (j = 0; j < block_size_i; ++j) {
C[j] = A[j] + v[j]; // FUN = '+'
}
}
// Step one block forth.
A_block += block_size_i;
C_block += block_size_i;
v += block_size_i;
}
} else if (*op == '-') {
// Iterate `(block_size_i, ncol)` submatrix blocks.
for (i = 0; i < nrow; i += block_size_i) {
// Set `A` and `C` to block beginning.
A = A_block;
C = C_block;
// Get current block's row size.
block_size_i = nrow - i;
if (block_size_i > block_size) {
block_size_i = block_size;
}
// Perform element wise operation for block.
for (; A < A_end; A += nrow, C += nrow) {
for (j = 0; j < block_size_i; ++j) {
C[j] = A[j] - v[j]; // FUN = '-'
}
}
// Step one block forth.
A_block += block_size_i;
C_block += block_size_i;
v += block_size_i;
}
} else if (*op == '*') {
// Iterate `(block_size_i, ncol)` submatrix blocks.
for (i = 0; i < nrow; i += block_size_i) {
// Set `A` and `C` to block beginning.
A = A_block;
C = C_block;
// Get current block's row size.
block_size_i = nrow - i;
if (block_size_i > block_size) {
block_size_i = block_size;
}
// Perform element wise operation for block.
for (; A < A_end; A += nrow, C += nrow) {
for (j = 0; j < block_size_i; ++j) {
C[j] = A[j] * v[j]; // FUN = '*'
}
}
// Step one block forth.
A_block += block_size_i;
C_block += block_size_i;
v += block_size_i;
}
} else if (*op == '/') {
// Iterate `(block_size_i, ncol)` submatrix blocks.
for (i = 0; i < nrow; i += block_size_i) {
// Set `A` and `C` to block beginning.
A = A_block;
C = C_block;
// Get current block's row size.
block_size_i = nrow - i;
if (block_size_i > block_size) {
block_size_i = block_size;
}
// Perform element wise operation for block.
for (; A < A_end; A += nrow, C += nrow) {
for (j = 0; j < block_size_i; ++j) {
C[j] = A[j] / v[j]; // FUN = '/'
}
}
// Step one block forth.
A_block += block_size_i;
C_block += block_size_i;
v += block_size_i;
}
} else {
error("Got unknown 'op' (opperation) argument.");
}
}
static inline void matrixprod(const double *A, const int nrowA, const int ncolA,
const double *B, const int nrowB, const int ncolB,
double *C) {
const double one = 1.0;
const double zero = 0.0;
// DGEMM with parameterization:
// C <- A %*% B
F77_NAME(dgemm)("N", "N", &nrowA, &ncolB, &ncolA,
&one, A, &nrowA, B, &nrowB,
&zero, C, &nrowA);
}
static inline void crossprod(const double *A, const int nrowA, const int ncolA,
const double *B, const int nrowB, const int ncolB,
double *C) {
const double one = 1.0;
const double zero = 0.0;
// DGEMM with parameterization:
// C <- t(A) %*% B
F77_NAME(dgemm)("T", "N", &ncolA, &ncolB, &nrowA,
&one, A, &nrowA, B, &nrowB,
&zero, C, &ncolA);
}
static inline void nullProj(const double *B, const int nrowB, const int ncolB,
double *Q) {
const double minusOne = -1.0;
const double one = 1.0;
// Initialize as identity matrix.
memset(Q, 0, sizeof(double) * nrowB * nrowB);
double *Q_diag, *Q_end = Q + nrowB * nrowB;
for (Q_diag = Q; Q_diag < Q_end; Q_diag += nrowB + 1) {
*Q_diag = 1.0;
}
// DGEMM with parameterization:
// C <- (-1.0 * B %*% t(B)) + C
F77_NAME(dgemm)("N", "T", &nrowB, &nrowB, &ncolB,
&minusOne, B, &nrowB, B, &nrowB,
&one, Q, &nrowB);
}
static inline void rangePairs(const int from, const int to, int *pairs) {
int i, j;
for (i = from; i < to; ++i) {
for (j = i + 1; j < to; ++j) {
pairs[0] = i;
pairs[1] = j;
pairs += 2;
}
}
}
// A dence skwe-symmetric rank 2 update.
// Perform the update
// C := alpha (A * B^T - B * A^T) + beta C
static void skewSymRank2k(const int nrow, const int ncol,
double alpha, const double *A, const double *B,
double beta,
double *C) {
F77_NAME(dgemm)("N", "T",
&nrow, &nrow, &ncol,
&alpha, A, &nrow, B, &nrow,
&beta, C, &nrow);
alpha *= -1.0;
beta = 1.0;
F77_NAME(dgemm)("N", "T",
&nrow, &nrow, &ncol,
&alpha, B, &nrow, A, &nrow,
&beta, C, &nrow);
}
// TODO: mutch potential for optimization!!!
static inline void weightedYandLoss(const int n,
const double *Y,
const double *vecD,
const double *vecW,
const double *colSums,
double *y1, double *L, double *vecS,
double *const loss) {
int i, j, k, N = n * (n - 1) / 2;
double l;
for (i = 0; i < n; ++i) {
y1[i] = Y[i] / colSums[i];
L[i] = Y[i] * Y[i] / colSums[i];
}
for (k = j = 0; j < n; ++j) {
for (i = j + 1; i < n; ++i, ++k) {
y1[i] += Y[j] * vecW[k] / colSums[i];
y1[j] += Y[i] * vecW[k] / colSums[j];
L[i] += Y[j] * Y[j] * vecW[k] / colSums[i];
L[j] += Y[i] * Y[i] * vecW[k] / colSums[j];
}
}
l = 0.0;
for (i = 0; i < n; ++i) {
l += (L[i] -= y1[i] * y1[i]);
}
*loss = l / (double)n;
for (k = j = 0; j < n; ++j) {
for (i = j + 1; i < n; ++i, ++k) {
l = Y[j] - y1[i];
vecS[k] = (L[i] - (l * l)) / colSums[i];
l = Y[i] - y1[j];
vecS[k] += (L[j] - (l * l)) / colSums[j];
}
}
for (k = 0; k < N; ++k) {
vecS[k] *= vecW[k] * vecD[k];
}
}
inline double gaussKernel(const double x, const double scale) {
return exp(scale * x * x);
}
static void gradient(const int n, const int p, const int q,
const double *X,
const double *X_diff,
const double *Y,
const double *V,
const double h,
double *G, double *const loss) {
// Number of X_i to X_j not trivial pairs.
int i, N = (n * (n - 1)) / 2;
double scale = -0.5 / h;
const double one = 1.0;
if (X_diff == (void*)0) {
// TODO: ...
}
// Allocate and compute projection matrix `Q = I_p - V * V^T`
double *Q = (double*)malloc(p * p * sizeof(double));
nullProj(V, p, q, Q);
// allocate and compute vectorized distance matrix with a temporary
// projection of `X_diff`.
double *vecD = (double*)malloc(N * sizeof(double));
double *X_proj;
if (p < 5) { // TODO: refine that!
X_proj = (double*)malloc(N * 5 * sizeof(double));
} else {
X_proj = (double*)malloc(N * p * sizeof(double));
}
matrixprod(X_diff, N, p, Q, p, p, X_proj);
rowSquareSums(X_proj, N, p, vecD);
// Apply kernel to distence vector for weights computation.
double *vecW = X_proj; // reuse memory area, no longer needed.
for (i = 0; i < N; ++i) {
vecW[i] = gaussKernel(vecD[i], scale);
}
double *colSums = X_proj + N; // still allocated!
rowSumsSymVec(vecW, n, &one, colSums); // rowSums = colSums cause Sym
// compute weighted responces of first end second momontum, aka y1, y2.
double *y1 = X_proj + N + n;
double *L = X_proj + N + (2 * n);
// Allocate X_diff scaling vector `vecS`, not in `X_proj` mem area because
// used symultanious to X_proj in final gradient computation.
double *vecS = (double*)malloc(N * sizeof(double));
weightedYandLoss(n, Y, vecD, vecW, colSums, y1, L, vecS, loss);
// compute the gradient using X_proj for intermidiate scaled X_diff.
rowSweep(X_diff, N, p, "*", vecS, X_proj);
// reuse Q which has the required dim (p, p).
crossprod(X_diff, N, p, X_proj, N, p, Q);
// Product with V
matrixprod(Q, p, p, V, p, q, G);
// And final scaling (TODO: move into matrixprod!)
scale = -2.0 / (((double)n) * h * h);
N = p * q;
for (i = 0; i < N; ++i) {
G[i] *= scale;
}
free(vecS);
free(X_proj);
free(vecD);
free(Q);
}