#include <stdlib.h>
#include <string.h> // for `mem*` functions.
#include <math.h>

#include <R_ext/BLAS.h>
#include <R_ext/Error.h>

#include "benchmark.h"

void rowSums(const double *A, const int nrow, const int ncol,
             double *sum) {
    int i, j, block_size, block_size_i;
    const double *A_block = A;
    const double *A_end = A + nrow * ncol;

    if (nrow > 508) {
        block_size = 508;
    } else {
        block_size = nrow;
    }

    // Iterate `(block_size_i, ncol)` submatrix blocks.
    for (i = 0; i < nrow; i += block_size_i) {
        // Reset `A` to new block beginning.
        A = A_block;
        // Take block size of everything left and reduce to max size.
        block_size_i = nrow - i;
        if (block_size_i > block_size) {
            block_size_i = block_size;
        }
        // Copy the block's first column (unrolled by 4, remainder below).
        for (j = 0; j + 3 < block_size_i; j += 4) {
            sum[j]     = A[j];
            sum[j + 1] = A[j + 1];
            sum[j + 2] = A[j + 2];
            sum[j + 3] = A[j + 3];
        }
        for (; j < block_size_i; ++j) {
            sum[j] = A[j];
        }
        // Sum following columns to the first one.
        for (A += nrow; A < A_end; A += nrow) {
            for (j = 0; j + 3 < block_size_i; j += 4) {
                sum[j]     += A[j];
                sum[j + 1] += A[j + 1];
                sum[j + 2] += A[j + 2];
                sum[j + 3] += A[j + 3];
            }
            for (; j < block_size_i; ++j) {
                sum[j] += A[j];
            }
        }
        // Step one block forth.
        A_block += block_size_i;
        sum += block_size_i;
    }
}

void rowSumsV2(const double *A, const int nrow, const int ncol,
               double *sum) {
    int i, j, block_size, block_size_i;
    const double *A_block = A;
    const double *A_end = A + nrow * ncol;

    if (nrow > CVE_MEM_CHUNK_SIZE) {
        block_size = CVE_MEM_CHUNK_SIZE;
    } else {
        block_size = nrow;
    }

    // Iterate `(block_size_i, ncol)` submatrix blocks.
    for (i = 0; i < nrow; i += block_size_i) {
        // Reset `A` to new block beginning.
        A = A_block;
        // Take block size of everything left and reduce to max size.
        block_size_i = nrow - i;
        if (block_size_i > block_size) {
            block_size_i = block_size;
        }
        // Compute the block's first column,
        for (j = 0; j < block_size_i; ++j) {
            sum[j] = A[j];
        }
        // and sum the following columns to the first one.
        for (A += nrow; A < A_end; A += nrow) {
            for (j = 0; j < block_size_i; ++j) {
                sum[j] += A[j];
            }
        }
        // Step one block forth.
        A_block += block_size_i;
        sum += block_size_i;
    }
}

void rowSumsV3(const double *A, const int nrow, const int ncol,
               double *sum) {
    int i;
    double *ones = (double*)malloc(ncol * sizeof(double));

    for (i = 0; i < ncol; ++i) {
        ones[i] = 1.0;
    }

    // Row sums as matrix-vector product `A %*% rep(1, ncol)`.
    matrixprod(A, nrow, ncol, ones, ncol, 1, sum);
    free(ones);
}

void colSums(const double *A, const int nrow, const int ncol,
             double *sums) {
    int i, j;
    int nrowb = 4 * (nrow / 4); // `nrow` rounded down to a multiple of 4.
    double sum;

    for (j = 0; j < ncol; ++j) {
        sum = 0.0;
        for (i = 0; i < nrowb; i += 4) {
            sum += A[i] + A[i + 1] + A[i + 2] + A[i + 3];
        }
        for (; i < nrow; ++i) {
            sum += A[i];
        }
        *(sums++) = sum;
        A += nrow;
    }
}

void rowSquareSums(const double *A, const int nrow, const int ncol,
                   double *sum) {
    int i, j, block_size, block_size_i;
    const double *A_block = A;
    const double *A_end = A + nrow * ncol;

    if (nrow > 508) {
        block_size = 508;
    } else {
        block_size = nrow;
    }

    // Iterate `(block_size_i, ncol)` submatrix blocks.
    for (i = 0; i < nrow; i += block_size_i) {
        // Reset `A` to new block beginning.
        A = A_block;
        // Take block size of everything left and reduce to max size.
        block_size_i = nrow - i;
        if (block_size_i > block_size) {
            block_size_i = block_size;
        }
        // Square and copy the block's first column (unrolled by 4).
        for (j = 0; j + 3 < block_size_i; j += 4) {
            sum[j]     = A[j] * A[j];
            sum[j + 1] = A[j + 1] * A[j + 1];
            sum[j + 2] = A[j + 2] * A[j + 2];
            sum[j + 3] = A[j + 3] * A[j + 3];
        }
        for (; j < block_size_i; ++j) {
            sum[j] = A[j] * A[j];
        }
        // Sum squares of the following columns to the first one.
        for (A += nrow; A < A_end; A += nrow) {
            for (j = 0; j + 3 < block_size_i; j += 4) {
                sum[j]     += A[j] * A[j];
                sum[j + 1] += A[j + 1] * A[j + 1];
                sum[j + 2] += A[j + 2] * A[j + 2];
                sum[j + 3] += A[j + 3] * A[j + 3];
            }
            for (; j < block_size_i; ++j) {
                sum[j] += A[j] * A[j];
            }
        }
        // Step one block forth.
        A_block += block_size_i;
        sum += block_size_i;
    }
}

// Row sums of a symmetric matrix given by its packed strictly lower
// triangular part `Avec` and a constant diagonal value `diag`.
void rowSumsSymVec(const double *Avec, const int nrow,
                   const double diag, double *sum) {
    int i, j;

    if (diag == 0.0) {
        memset(sum, 0, nrow * sizeof(double));
    } else {
        for (i = 0; i < nrow; ++i) {
            sum[i] = diag;
        }
    }

    for (j = 0; j < nrow; ++j) {
        for (i = j + 1; i < nrow; ++i, ++Avec) {
            sum[j] += *Avec;
            sum[i] += *Avec;
        }
    }
}

#define ROW_SWEEP_ALG(op)                                                  \
    /* Iterate `(block_size_i, ncol)` submatrix blocks. */                 \
    for (i = 0; i < nrow; i += block_size_i) {                             \
        /* Set `A` and `C` to block beginning. */                          \
        A = A_block;                                                       \
        C = C_block;                                                       \
        /* Get current block's row size. */                                \
        block_size_i = nrow - i;                                           \
        if (block_size_i > block_size) {                                   \
            block_size_i = block_size;                                     \
        }                                                                  \
        /* Perform element-wise operation for block. */                    \
        for (; A < A_end; A += nrow, C += nrow) {                          \
            for (j = 0; j < block_size_i; ++j) {                           \
                C[j] = (A[j]) op (v[j]);                                   \
            }                                                              \
        }                                                                  \
        /* Step one block forth. */                                        \
        A_block += block_size_i;                                           \
        C_block += block_size_i;                                           \
        v += block_size_i;                                                 \
    }

/* C[, j] = A[, j] op v for each j = 1 to ncol */
void rowSweep(const double *A, const int nrow, const int ncol,
              const char *op,
              const double *v, // vector of length nrow
              double *C) {
    int i, j, block_size, block_size_i;
    const double *A_block = A;
    double *C_block = C;
    const double *A_end = A + nrow * ncol;

    if (nrow > CVE_MEM_CHUNK_SMALL) { // small because 3 vectors in cache
        block_size = CVE_MEM_CHUNK_SMALL;
    } else {
        block_size = nrow;
    }

    if (*op == '+') {
        ROW_SWEEP_ALG(+)
    } else if (*op == '-') {
        ROW_SWEEP_ALG(-)
    } else if (*op == '*') {
        ROW_SWEEP_ALG(*)
    } else if (*op == '/') {
        ROW_SWEEP_ALG(/)
    } else {
        error("Got unknown 'op' (operation) argument.");
    }
}

void transpose(const double *A, const int nrow, const int ncol, double *T) {
    int i, j, len = nrow * ncol;

    // Filling column-wise and accessing row-wise.
    for (i = 0, j = 0; i < len; ++i, j += nrow) {
        if (j >= len) {
            j -= len - 1;
        }
        T[i] = A[j];
    }
}

// Symmetric Packed matrix vector product.
// Computes
//      y <- A x
// where A is supplied as packed lower triangular part of a symmetric
// matrix A. Meaning that `vecA` is `vec_ltri(A)`, i.e. the columns of the
// lower triangle (including the diagonal) stored contiguously.
void sympMV(const double *vecA, const int nrow, const double *x, double *y) {
    double one = 1.0;
    double zero = 0.0;
    int onei = 1;

    F77_NAME(dspmv)("L", &nrow, &one, vecA, x, &onei, &zero, y, &onei);
}

void matrixprod(const double *A, const int nrowA, const int ncolA,
                const double *B, const int nrowB, const int ncolB,
                double *C) {
    const double one = 1.0;
    const double zero = 0.0;

    // DGEMM with parameterization:
    //      C <- A %*% B
    F77_NAME(dgemm)("N", "N", &nrowA, &ncolB, &ncolA,
                    &one, A, &nrowA, B, &nrowB,
                    &zero, C, &nrowA);
}

void crossprod(const double *A, const int nrowA, const int ncolA,
               const double *B, const int nrowB, const int ncolB,
               double *C) {
    const double one = 1.0;
    const double zero = 0.0;

    // DGEMM with parameterization:
    //      C <- t(A) %*% B
    F77_NAME(dgemm)("T", "N", &ncolA, &ncolB, &nrowA,
                    &one, A, &nrowA, B, &nrowB,
                    &zero, C, &ncolA);
}

#define KRONECKER_ALG(op)                                                  \
    for (j = 0; j < ncolA; ++j) {                                          \
        for (l = 0; l < ncolB; ++l) {                                      \
            colB = B + (l * nrowB);                                        \
            for (i = 0; i < nrowA; ++i) {                                  \
                for (k = 0; k < nrowB; ++k) {                              \
                    *(C++) = (A[i]) op (colB[k]);                          \
                }                                                          \
            }                                                              \
        }                                                                  \
        A += nrowA;                                                        \
    }

void kronecker(const double * restrict A, const int nrowA, const int ncolA,
               const double * restrict B, const int nrowB, const int ncolB,
               const char *op,
               double * restrict C) {
    int i, j, k, l;
    const double *colB;

    if (*op == '+') {
        KRONECKER_ALG(+)
    } else if (*op == '-') {
        KRONECKER_ALG(-)
    } else if (*op == '*') {
        KRONECKER_ALG(*)
    } else if (*op == '/') {
        KRONECKER_ALG(/)
    } else {
        error("Got unknown 'op' (operation) argument.");
    }
}

void nullProj(const double *B, const int nrowB, const int ncolB,
              double *Q) {
    const double minusOne = -1.0;
    const double one = 1.0;

    // Initialize Q as identity matrix.
    memset(Q, 0, sizeof(double) * nrowB * nrowB);
    double *Q_diag, *Q_end = Q + nrowB * nrowB;
    for (Q_diag = Q; Q_diag < Q_end; Q_diag += nrowB + 1) {
        *Q_diag = 1.0;
    }

    // DGEMM with parameterization:
    //      Q <- (-1.0 * B %*% t(B)) + Q
    F77_NAME(dgemm)("N", "T", &nrowB, &nrowB, &ncolB,
                    &minusOne, B, &nrowB, B, &nrowB,
                    &one, Q, &nrowB);
}

void rangePairs(const int from, const int to, int *pairs) {
    int i, j;
    for (i = from; i < to; ++i) {
        for (j = i + 1; j < to; ++j) {
            pairs[0] = i;
            pairs[1] = j;
            pairs += 2;
        }
    }
}

// A dense skew-symmetric rank 2k update.
// Performs the update
//      C := alpha (A %*% t(B) - B %*% t(A)) + beta C
void skewSymRank2k(const int nrow, const int ncol,
                   double alpha, const double *A, const double *B,
                   double beta,
                   double *C) {
    F77_NAME(dgemm)("N", "T", &nrow, &nrow, &ncol,
                    &alpha, A, &nrow, B, &nrow,
                    &beta, C, &nrow);

    alpha *= -1.0;
    beta = 1.0;
    F77_NAME(dgemm)("N", "T", &nrow, &nrow, &ncol,
                    &alpha, B, &nrow, A, &nrow,
                    &beta, C, &nrow);
}

// TODO: clarify
static inline double gaussKernel(const double x, const double scale) {
    return exp(scale * x * x);
}

// TODO: much potential for optimization!!!
static void weightedYandLoss(const int n,
                             const double *Y, const double *vecD,
                             const double *vecW, const double *colSums,
                             double *y1, double *L, double *vecS,
                             double *loss) {
    int i, j, k, N = n * (n - 1) / 2;
    double l;

    for (i = 0; i < n; ++i) {
        y1[i] = Y[i];
        L[i] = Y[i] * Y[i];
    }

    for (k = j = 0; j < n; ++j) {
        for (i = j + 1; i < n; ++i, ++k) {
            y1[i] += Y[j] * vecW[k];
            y1[j] += Y[i] * vecW[k];
            L[i] += Y[j] * Y[j] * vecW[k];
            L[j] += Y[i] * Y[i] * vecW[k];
        }
    }

    for (i = 0; i < n; ++i) {
        y1[i] /= colSums[i];
        L[i] /= colSums[i];
    }

    l = 0.0;
    for (i = 0; i < n; ++i) {
        l += (L[i] -= y1[i] * y1[i]);
    }
    *loss = l / (double)n;

    for (k = j = 0; j < n; ++j) {
        for (i = j + 1; i < n; ++i, ++k) {
            l = Y[j] - y1[i];
            vecS[k] = (L[i] - (l * l)) / colSums[i];
            l = Y[i] - y1[j];
            vecS[k] += (L[j] - (l * l)) / colSums[j];
        }
    }

    for (k = 0; k < N; ++k) {
        vecS[k] *= vecW[k] * vecD[k];
    }
}

void grad(const int n, const int p, const int q,
          const double *X,
          const double *X_diff,
          const double *Y,
          const double *V,
          const double h,
          double *G, double *loss) {
    // Number of non-trivial pairs X_i to X_j.
    int i, N = (n * (n - 1)) / 2;
    double scale = -0.5 / h;

    if (X_diff == (void*)0) {
        // TODO: ...
    }

    // Allocate and compute projection matrix `Q = I_p - V %*% t(V)`.
    double *Q = (double*)malloc(p * p * sizeof(double));
    nullProj(V, p, q, Q);

    // Allocate and compute vectorized distance matrix with a temporary
    // projection of `X_diff`.
    double *vecD = (double*)malloc(N * sizeof(double));
    double *X_proj;
    if (p < 5) { // TODO: refine that!
        X_proj = (double*)malloc(N * 5 * sizeof(double));
    } else {
        X_proj = (double*)malloc(N * p * sizeof(double));
    }
    matrixprod(X_diff, N, p, Q, p, p, X_proj);
    rowSquareSums(X_proj, N, p, vecD);

    // Apply kernel to distance vector for weights computation.
    double *vecK = X_proj; // reuse memory area, no longer needed.
    for (i = 0; i < N; ++i) {
        vecK[i] = gaussKernel(vecD[i], scale);
    }
    double *colSums = X_proj + N; // still allocated!
    rowSumsSymVec(vecK, n, 1.0, colSums); // rowSums = colSums because symmetric.

    // Compute weighted responses of first and second moment, aka y1, y2.
    double *y1 = X_proj + N + n;
    double *L = X_proj + N + (2 * n);
    // Allocate the X_diff scaling vector `vecS`, not in the `X_proj` memory
    // area because it is used simultaneously with `X_proj` in the final
    // gradient computation.
    double *vecS = (double*)malloc(N * sizeof(double));
    weightedYandLoss(n, Y, vecD, vecK, colSums, y1, L, vecS, loss);

    // Compute the gradient using X_proj for the intermediate scaled X_diff.
    rowSweep(X_diff, N, p, "*", vecS, X_proj);
    // Reuse Q, which has the required dim (p, p).
    crossprod(X_diff, N, p, X_proj, N, p, Q);
    // Product with V.
    matrixprod(Q, p, p, V, p, q, G);
    // And final scaling (TODO: move into matrixprod!)
    scale = -2.0 / (((double)n) * h * h);
    N = p * q;
    for (i = 0; i < N; ++i) {
        G[i] *= scale;
    }

    free(vecS);
    free(X_proj);
    free(vecD);
    free(Q);
}
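
// ---------------------------------------------------------------------------
// Minimal usage sketch (editor addition, not part of the original routines):
// exercises the pure-C helpers `rowSums`, `colSums` and `transpose` on a
// small column-major matrix. The guard macro `BENCHMARK_STANDALONE_EXAMPLE`
// is hypothetical; compiling this translation unit with it defined would
// still require R's BLAS and `error` symbols for the other routines, so this
// is a sketch of how the helpers are meant to be called rather than a
// drop-in test program.
#ifdef BENCHMARK_STANDALONE_EXAMPLE
#include <stdio.h>

int main(void) {
    // 3 x 2 matrix in column-major order:
    //      1  4
    //      2  5
    //      3  6
    const double A[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
    double rsum[3], csum[2], T[6];
    int i;

    rowSums(A, 3, 2, rsum);   // expected: 5 7 9
    colSums(A, 3, 2, csum);   // expected: 6 15
    transpose(A, 3, 2, T);    // T is 2 x 3, column-major: 1 4 | 2 5 | 3 6

    printf("rowSums: %g %g %g\n", rsum[0], rsum[1], rsum[2]);
    printf("colSums: %g %g\n", csum[0], csum[1]);
    for (i = 0; i < 6; ++i) {
        printf("T[%d] = %g\n", i, T[i]);
    }
    return 0;
}
#endif // BENCHMARK_STANDALONE_EXAMPLE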