#include #include #include #include // #include #include "wip.h" static inline void rowSums(const double *A, const int nrow, const int ncol, double *sum) { int i, j, block_size, block_size_i; const double *A_block = A; const double *A_end = A + nrow * ncol; if (nrow > CVE_MEM_CHUNK_SIZE) { block_size = CVE_MEM_CHUNK_SIZE; } else { block_size = nrow; } // Iterate `(block_size_i, ncol)` submatrix blocks. for (i = 0; i < nrow; i += block_size_i) { // Reset `A` to new block beginning. A = A_block; // Take block size of eveything left and reduce to max size. block_size_i = nrow - i; if (block_size_i > block_size) { block_size_i = block_size; } // Compute first blocks column, for (j = 0; j < block_size_i; ++j) { sum[j] = A[j]; } // and sum the following columns to the first one. for (A += nrow; A < A_end; A += nrow) { for (j = 0; j < block_size_i; ++j) { sum[j] += A[j]; } } // Step one block forth. A_block += block_size_i; sum += block_size_i; } } static inline void colSums(const double *A, const int nrow, const int ncol, double *sum) { int j; double *sum_end = sum + ncol; memset(sum, 0, sizeof(double) * ncol); for (; sum < sum_end; ++sum) { for (j = 0; j < nrow; ++j) { *sum += A[j]; } A += nrow; } } static inline void rowSquareSums(const double *A, const int nrow, const int ncol, double *sum) { int i, j, block_size, block_size_i; const double *A_block = A; const double *A_end = A + nrow * ncol; if (nrow < CVE_MEM_CHUNK_SIZE) { block_size = nrow; } else { block_size = CVE_MEM_CHUNK_SIZE; } // Iterate `(block_size_i, ncol)` submatrix blocks. for (i = 0; i < nrow; i += block_size_i) { // Reset `A` to new block beginning. A = A_block; // Take block size of eveything left and reduce to max size. block_size_i = nrow - i; if (block_size_i > block_size) { block_size_i = block_size; } // Compute first blocks column, for (j = 0; j < block_size_i; ++j) { sum[j] = A[j] * A[j]; } // and sum the following columns to the first one. for (A += nrow; A < A_end; A += nrow) { for (j = 0; j < block_size_i; ++j) { sum[j] += A[j] * A[j]; } } // Step one block forth. A_block += block_size_i; sum += block_size_i; } } static inline void rowSumsSymVec(const double *Avec, const int nrow, const double *diag, double *sum) { int i, j; if (*diag == 0.0) { memset(sum, 0, nrow * sizeof(double)); } else { for (i = 0; i < nrow; ++i) { sum[i] = *diag; } } for (j = 0; j < nrow; ++j) { for (i = j + 1; i < nrow; ++i, ++Avec) { sum[j] += *Avec; sum[i] += *Avec; } } } /* C[, j] = A[, j] * v for each j = 1 to ncol */ static void rowSweep(const double *A, const int nrow, const int ncol, const char* op, const double *v, // vector of length nrow double *C) { int i, j, block_size, block_size_i; const double *A_block = A; double *C_block = C; const double *A_end = A + nrow * ncol; if (nrow > CVE_MEM_CHUNK_SMALL) { // small because 3 vectors in cache block_size = CVE_MEM_CHUNK_SMALL; } else { block_size = nrow; } if (*op == '+') { // Iterate `(block_size_i, ncol)` submatrix blocks. for (i = 0; i < nrow; i += block_size_i) { // Set `A` and `C` to block beginning. A = A_block; C = C_block; // Get current block's row size. block_size_i = nrow - i; if (block_size_i > block_size) { block_size_i = block_size; } // Perform element wise operation for block. for (; A < A_end; A += nrow, C += nrow) { for (j = 0; j < block_size_i; ++j) { C[j] = A[j] + v[j]; // FUN = '+' } } // Step one block forth. A_block += block_size_i; C_block += block_size_i; v += block_size_i; } } else if (*op == '-') { // Iterate `(block_size_i, ncol)` submatrix blocks. for (i = 0; i < nrow; i += block_size_i) { // Set `A` and `C` to block beginning. A = A_block; C = C_block; // Get current block's row size. block_size_i = nrow - i; if (block_size_i > block_size) { block_size_i = block_size; } // Perform element wise operation for block. for (; A < A_end; A += nrow, C += nrow) { for (j = 0; j < block_size_i; ++j) { C[j] = A[j] - v[j]; // FUN = '-' } } // Step one block forth. A_block += block_size_i; C_block += block_size_i; v += block_size_i; } } else if (*op == '*') { // Iterate `(block_size_i, ncol)` submatrix blocks. for (i = 0; i < nrow; i += block_size_i) { // Set `A` and `C` to block beginning. A = A_block; C = C_block; // Get current block's row size. block_size_i = nrow - i; if (block_size_i > block_size) { block_size_i = block_size; } // Perform element wise operation for block. for (; A < A_end; A += nrow, C += nrow) { for (j = 0; j < block_size_i; ++j) { C[j] = A[j] * v[j]; // FUN = '*' } } // Step one block forth. A_block += block_size_i; C_block += block_size_i; v += block_size_i; } } else if (*op == '/') { // Iterate `(block_size_i, ncol)` submatrix blocks. for (i = 0; i < nrow; i += block_size_i) { // Set `A` and `C` to block beginning. A = A_block; C = C_block; // Get current block's row size. block_size_i = nrow - i; if (block_size_i > block_size) { block_size_i = block_size; } // Perform element wise operation for block. for (; A < A_end; A += nrow, C += nrow) { for (j = 0; j < block_size_i; ++j) { C[j] = A[j] / v[j]; // FUN = '/' } } // Step one block forth. A_block += block_size_i; C_block += block_size_i; v += block_size_i; } } else { error("Got unknown 'op' (opperation) argument."); } } static inline void matrixprod(const double *A, const int nrowA, const int ncolA, const double *B, const int nrowB, const int ncolB, double *C) { const double one = 1.0; const double zero = 0.0; // DGEMM with parameterization: // C <- A %*% B F77_NAME(dgemm)("N", "N", &nrowA, &ncolB, &ncolA, &one, A, &nrowA, B, &nrowB, &zero, C, &nrowA); } static inline void crossprod(const double *A, const int nrowA, const int ncolA, const double *B, const int nrowB, const int ncolB, double *C) { const double one = 1.0; const double zero = 0.0; // DGEMM with parameterization: // C <- t(A) %*% B F77_NAME(dgemm)("T", "N", &ncolA, &ncolB, &nrowA, &one, A, &nrowA, B, &nrowB, &zero, C, &ncolA); } static inline void nullProj(const double *B, const int nrowB, const int ncolB, double *Q) { const double minusOne = -1.0; const double one = 1.0; // Initialize as identity matrix. memset(Q, 0, sizeof(double) * nrowB * nrowB); double *Q_diag, *Q_end = Q + nrowB * nrowB; for (Q_diag = Q; Q_diag < Q_end; Q_diag += nrowB + 1) { *Q_diag = 1.0; } // DGEMM with parameterization: // C <- (-1.0 * B %*% t(B)) + C F77_NAME(dgemm)("N", "T", &nrowB, &nrowB, &ncolB, &minusOne, B, &nrowB, B, &nrowB, &one, Q, &nrowB); } static inline void rangePairs(const int from, const int to, int *pairs) { int i, j; for (i = from; i < to; ++i) { for (j = i + 1; j < to; ++j) { pairs[0] = i; pairs[1] = j; pairs += 2; } } } // A dence skwe-symmetric rank 2 update. // Perform the update // C := alpha (A * B^T - B * A^T) + beta C static void skewSymRank2k(const int nrow, const int ncol, double alpha, const double *A, const double *B, double beta, double *C) { F77_NAME(dgemm)("N", "T", &nrow, &nrow, &ncol, &alpha, A, &nrow, B, &nrow, &beta, C, &nrow); alpha *= -1.0; beta = 1.0; F77_NAME(dgemm)("N", "T", &nrow, &nrow, &ncol, &alpha, B, &nrow, A, &nrow, &beta, C, &nrow); } // TODO: mutch potential for optimization!!! static inline void weightedYandLoss(const int n, const double *Y, const double *vecD, const double *vecW, const double *colSums, double *y1, double *L, double *vecS, double *const loss) { int i, j, k, N = n * (n - 1) / 2; double l; for (i = 0; i < n; ++i) { y1[i] = Y[i] / colSums[i]; L[i] = Y[i] * Y[i] / colSums[i]; } for (k = j = 0; j < n; ++j) { for (i = j + 1; i < n; ++i, ++k) { y1[i] += Y[j] * vecW[k] / colSums[i]; y1[j] += Y[i] * vecW[k] / colSums[j]; L[i] += Y[j] * Y[j] * vecW[k] / colSums[i]; L[j] += Y[i] * Y[i] * vecW[k] / colSums[j]; } } l = 0.0; for (i = 0; i < n; ++i) { l += (L[i] -= y1[i] * y1[i]); } *loss = l / (double)n; for (k = j = 0; j < n; ++j) { for (i = j + 1; i < n; ++i, ++k) { l = Y[j] - y1[i]; vecS[k] = (L[i] - (l * l)) / colSums[i]; l = Y[i] - y1[j]; vecS[k] += (L[j] - (l * l)) / colSums[j]; } } for (k = 0; k < N; ++k) { vecS[k] *= vecW[k] * vecD[k]; } } inline double gaussKernel(const double x, const double scale) { return exp(scale * x * x); } static void gradient(const int n, const int p, const int q, const double *X, const double *X_diff, const double *Y, const double *V, const double h, double *G, double *const loss) { // Number of X_i to X_j not trivial pairs. int i, N = (n * (n - 1)) / 2; double scale = -0.5 / h; const double one = 1.0; if (X_diff == (void*)0) { // TODO: ... } // Allocate and compute projection matrix `Q = I_p - V * V^T` double *Q = (double*)malloc(p * p * sizeof(double)); nullProj(V, p, q, Q); // allocate and compute vectorized distance matrix with a temporary // projection of `X_diff`. double *vecD = (double*)malloc(N * sizeof(double)); double *X_proj; if (p < 5) { // TODO: refine that! X_proj = (double*)malloc(N * 5 * sizeof(double)); } else { X_proj = (double*)malloc(N * p * sizeof(double)); } matrixprod(X_diff, N, p, Q, p, p, X_proj); rowSquareSums(X_proj, N, p, vecD); // Apply kernel to distence vector for weights computation. double *vecW = X_proj; // reuse memory area, no longer needed. for (i = 0; i < N; ++i) { vecW[i] = gaussKernel(vecD[i], scale); } double *colSums = X_proj + N; // still allocated! rowSumsSymVec(vecW, n, &one, colSums); // rowSums = colSums cause Sym // compute weighted responces of first end second momontum, aka y1, y2. double *y1 = X_proj + N + n; double *L = X_proj + N + (2 * n); // Allocate X_diff scaling vector `vecS`, not in `X_proj` mem area because // used symultanious to X_proj in final gradient computation. double *vecS = (double*)malloc(N * sizeof(double)); weightedYandLoss(n, Y, vecD, vecW, colSums, y1, L, vecS, loss); // compute the gradient using X_proj for intermidiate scaled X_diff. rowSweep(X_diff, N, p, "*", vecS, X_proj); // reuse Q which has the required dim (p, p). crossprod(X_diff, N, p, X_proj, N, p, Q); // Product with V matrixprod(Q, p, p, V, p, q, G); // And final scaling (TODO: move into matrixprod!) scale = -2.0 / (((double)n) * h * h); N = p * q; for (i = 0; i < N; ++i) { G[i] *= scale; } free(vecS); free(X_proj); free(vecD); free(Q); }