From 93dddda63e4345961a779ee125d748c1eeef4769 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Fri, 18 Sep 2015 17:46:41 +0200 Subject: Improved the organization and performance of level 2 routines --- src/kernels/level2/xgemv.opencl | 80 +++++++++++++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 19 deletions(-) (limited to 'src/kernels/level2') diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl index 1e12dd78..0ecfc960 100644 --- a/src/kernels/level2/xgemv.opencl +++ b/src/kernels/level2/xgemv.opencl @@ -79,22 +79,61 @@ R"( #endif // ================================================================================================= -// Defines how to load the input matrix in the regular case -// Loads a scalar input value +// Defines how to load the input matrix in the non-vectorized case inline real LoadMatrixA(const __global real* restrict agm, const int x, const int y, - const int a_ld, const int a_offset) { - return agm[x + a_ld*y + a_offset]; + const int a_ld, const int a_offset, const int reversed, + const int kl, const int ku) { + real result; + + // For symmetric matrices + #if defined(ROUTINE_SYMV) + if ((reversed == 0 && y <= x) || (reversed == 1 && x <= y)) { + result = agm[y*a_ld + x + a_offset]; + } + else { + result = agm[x*a_ld + y + a_offset]; + } + + // For hermitian matrices + #elif defined(ROUTINE_HEMV) + if ((reversed == 0 && y <= x) || (reversed == 1 && x <= y)) { + result = agm[y*a_ld + x + a_offset]; + if (x == y) { result.y = ZERO; } + } + else { + result = agm[x*a_ld + y + a_offset]; + COMPLEX_CONJUGATE(result); + } + + // For banded matrices + #elif defined(ROUTINE_GBMV) + const int k = ku-y+x; + if (x >= y-ku && x < y+kl+1) { + result = agm[a_ld*y + k + a_offset]; + } + else { + SetToZero(result); + } + + // For general matrices + #else + result = agm[a_ld*y + x + a_offset]; + #endif + + return result; } + // Loads a vector input value (1/2) inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y, const int a_ld) { - return agm[x + a_ld*y]; + return agm[a_ld*y + x]; } + // Loads a vector input value (2/2): as before, but different data-type inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y, const int a_ld) { - return agm[x + a_ld*y]; + return agm[a_ld*y + x]; } // ================================================================================================= @@ -106,7 +145,8 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, const __global real* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, - const int do_conjugate) { + const int do_conjugate, const int reversed, + const int kl, const int ku) { // Local memory for the vector X __local real xlm[WGS1]; @@ -141,20 +181,20 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, // The multiply-add function for the main part (divisable by WGS1) if (a_rotated == 0) { // Not rotated #pragma unroll - for (int kl=0; kl