Diffstat (limited to 'src')
-rw-r--r--  src/kernels/xgemv.opencl | 342
-rw-r--r--  src/routines/xgemv.cc    |  36
-rw-r--r--  src/tuning/tuning.cc     |  54
-rw-r--r--  src/tuning/xgemv.cc      |  62
4 files changed, 306 insertions, 188 deletions
diff --git a/src/kernels/xgemv.opencl b/src/kernels/xgemv.opencl
index 26e7587f..b1b2fc69 100644
--- a/src/kernels/xgemv.opencl
+++ b/src/kernels/xgemv.opencl
@@ -19,42 +19,63 @@ R"(
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
-#ifndef WGS
- #define WGS 64 // The local work-group size
+
+// 1: For the full version of the kernel
+#ifndef WGS1
+ #define WGS1 64 // The local work-group size
+#endif
+#ifndef WPT1
+ #define WPT1 1 // The amount of work-per-thread
+#endif
+
+// 2: For the fast version
+#ifndef WGS2
+ #define WGS2 64 // The local work-group size
+#endif
+#ifndef WPT2
+ #define WPT2 1 // The amount of work-per-thread
+#endif
+#ifndef VW2
+ #define VW2 1 // Vector width of matrix A loads
#endif
-#ifndef WPT
- #define WPT 1 // The amount of work-per-thread
+
+// 3: For the fast rotated version
+#ifndef WGS3
+ #define WGS3 64 // The local work-group size
+#endif
+#ifndef WPT3
+ #define WPT3 1 // The amount of work-per-thread
#endif
-#ifndef VW
- #define VW 1 // Vector width of matrix A loads (only for the fast kernel)
+#ifndef VW3
+ #define VW3 1 // Vector width of matrix A loads
#endif
// =================================================================================================
// Full version of the kernel
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
- const int a_transposed,
+ const int a_rotated,
const __global real* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
// Local memory for the vector X
- __local real xlm[WGS];
+ __local real xlm[WGS1];
// Initializes the accumulation register
- real acc[WPT];
+ real acc[WPT1];
#pragma unroll
- for (int w=0; w<WPT; ++w) {
+ for (int w=0; w<WPT1; ++w) {
SetToZero(acc[w]);
}
// Divides the work in a main and tail section
- const int n_tail = n % WGS;
+ const int n_tail = n % WGS1;
const int n_floor = n - n_tail;
// Loops over work-group sized portions of the work
- for (int kwg=0; kwg<n_floor; kwg+=WGS) {
+ for (int kwg=0; kwg<n_floor; kwg+=WGS1) {
// Loads the vector X into local memory
const int lid = get_local_id(0);
@@ -65,21 +86,21 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
// Loops over the work per thread, and checks whether in bounds
#pragma unroll
- for (int w=0; w<WPT; ++w) {
+ for (int w=0; w<WPT1; ++w) {
const int gid = w*get_global_size(0) + get_global_id(0);
if (gid < m) {
- // The multiply-add function for the main part (divisable by WGS)
- if (a_transposed == 0) { // Not transposed
+ // The multiply-add function for the main part (divisible by WGS1)
+ if (a_rotated == 0) { // Not rotated
#pragma unroll
- for (int kl=0; kl<WGS; ++kl) {
+ for (int kl=0; kl<WGS1; ++kl) {
const int k = kwg + kl;
MultiplyAdd(acc[w], xlm[kl], agm[gid + a_ld*k + a_offset]);
}
}
else { // Transposed
#pragma unroll
- for (int kl=0; kl<WGS; ++kl) {
+ for (int kl=0; kl<WGS1; ++kl) {
const int k = kwg + kl;
MultiplyAdd(acc[w], xlm[kl], agm[k + a_ld*gid + a_offset]);
}
@@ -93,12 +114,12 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
// Loops over the work per thread, and checks whether in bounds
#pragma unroll
- for (int w=0; w<WPT; ++w) {
+ for (int w=0; w<WPT1; ++w) {
const int gid = w*get_global_size(0) + get_global_id(0);
if (gid < m) {
- // The multiply-add function for the remainder part (not divisable by WGS)
- if (a_transposed == 0) { // Not transposed
+ // The multiply-add function for the remainder part (not divisible by WGS1)
+ if (a_rotated == 0) { // Not rotated
#pragma unroll
for (int k=n_floor; k<n; ++k) {
MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], agm[gid + a_ld*k + a_offset]);
@@ -120,40 +141,41 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
// =================================================================================================
// Data-widths for the 'fast' kernel
-#if VW == 1
- typedef real realV;
-#elif VW == 2
- typedef real2 realV;
-#elif VW == 4
- typedef real4 realV;
-#elif VW == 8
- typedef real8 realV;
-#elif VW == 16
- typedef real16 realV;
+#if VW2 == 1
+ typedef real realVF;
+#elif VW2 == 2
+ typedef real2 realVF;
+#elif VW2 == 4
+ typedef real4 realVF;
+#elif VW2 == 8
+ typedef real8 realVF;
+#elif VW2 == 16
+ typedef real16 realVF;
#endif
// Faster version of the kernel, assuming that:
-// --> 'm' and 'n' are multiples of WGS
+// --> 'm' and 'n' are multiples of WGS2
// --> 'a_offset' is 0
-// --> 'a_ld' is a multiple of VW
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
+// --> 'a_ld' is a multiple of VW2
+// --> 'a_rotated' is 0
+__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XgemvFast(const int m, const int n, const real alpha, const real beta,
- const int a_transposed,
- const __global realV* restrict agm, const int a_offset, const int a_ld,
+ const int a_rotated,
+ const __global realVF* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
// Local memory for the vector X
- __local real xlm[WGS];
+ __local real xlm[WGS2];
// Initializes the accumulation register
- real acc[WPT];
+ real acc[WPT2];
#pragma unroll
- for (int w=0; w<WPT; ++w) {
+ for (int w=0; w<WPT2; ++w) {
SetToZero(acc[w]);
}
// Loops over work-group sized portions of the work
- for (int kwg=0; kwg<n; kwg+=WGS) {
+ for (int kwg=0; kwg<n; kwg+=WGS2) {
// Loads the vector X into local memory
const int lid = get_local_id(0);
@@ -162,102 +184,156 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
- // The multiply-add function (not transposed)
- if (a_transposed == 0) {
+ // The multiply-add function (not rotated)
+ #pragma unroll
+ for (int kl=0; kl<WGS2; ++kl) {
+ const int k = kwg + kl;
#pragma unroll
- for (int kl=0; kl<WGS; ++kl) {
- const int k = kwg + kl;
- #pragma unroll
- for (int w=0; w<WPT/VW; ++w) {
- const int gid = (WPT/VW)*get_global_id(0) + w;
- #if VW == 1
- MultiplyAdd(acc[VW*w+0], xlm[kl], agm[gid + (a_ld/VW)*k]);
- #elif VW == 2
- MultiplyAdd(acc[VW*w+0], xlm[kl], agm[gid + (a_ld/VW)*k].x);
- MultiplyAdd(acc[VW*w+1], xlm[kl], agm[gid + (a_ld/VW)*k].y);
- #elif VW == 4
- MultiplyAdd(acc[VW*w+0], xlm[kl], agm[gid + (a_ld/VW)*k].x);
- MultiplyAdd(acc[VW*w+1], xlm[kl], agm[gid + (a_ld/VW)*k].y);
- MultiplyAdd(acc[VW*w+2], xlm[kl], agm[gid + (a_ld/VW)*k].z);
- MultiplyAdd(acc[VW*w+3], xlm[kl], agm[gid + (a_ld/VW)*k].w);
- #elif VW == 8
- MultiplyAdd(acc[VW*w+0], xlm[kl], agm[gid + (a_ld/VW)*k].s0);
- MultiplyAdd(acc[VW*w+1], xlm[kl], agm[gid + (a_ld/VW)*k].s1);
- MultiplyAdd(acc[VW*w+2], xlm[kl], agm[gid + (a_ld/VW)*k].s2);
- MultiplyAdd(acc[VW*w+3], xlm[kl], agm[gid + (a_ld/VW)*k].s3);
- MultiplyAdd(acc[VW*w+4], xlm[kl], agm[gid + (a_ld/VW)*k].s4);
- MultiplyAdd(acc[VW*w+5], xlm[kl], agm[gid + (a_ld/VW)*k].s5);
- MultiplyAdd(acc[VW*w+6], xlm[kl], agm[gid + (a_ld/VW)*k].s6);
- MultiplyAdd(acc[VW*w+7], xlm[kl], agm[gid + (a_ld/VW)*k].s7);
- #elif VW == 16
- MultiplyAdd(acc[VW*w+0], xlm[kl], agm[gid + (a_ld/VW)*k].s0);
- MultiplyAdd(acc[VW*w+1], xlm[kl], agm[gid + (a_ld/VW)*k].s1);
- MultiplyAdd(acc[VW*w+2], xlm[kl], agm[gid + (a_ld/VW)*k].s2);
- MultiplyAdd(acc[VW*w+3], xlm[kl], agm[gid + (a_ld/VW)*k].s3);
- MultiplyAdd(acc[VW*w+4], xlm[kl], agm[gid + (a_ld/VW)*k].s4);
- MultiplyAdd(acc[VW*w+5], xlm[kl], agm[gid + (a_ld/VW)*k].s5);
- MultiplyAdd(acc[VW*w+6], xlm[kl], agm[gid + (a_ld/VW)*k].s6);
- MultiplyAdd(acc[VW*w+7], xlm[kl], agm[gid + (a_ld/VW)*k].s7);
- MultiplyAdd(acc[VW*w+8], xlm[kl], agm[gid + (a_ld/VW)*k].s8);
- MultiplyAdd(acc[VW*w+9], xlm[kl], agm[gid + (a_ld/VW)*k].s9);
- MultiplyAdd(acc[VW*w+10], xlm[kl], agm[gid + (a_ld/VW)*k].sA);
- MultiplyAdd(acc[VW*w+11], xlm[kl], agm[gid + (a_ld/VW)*k].sB);
- MultiplyAdd(acc[VW*w+12], xlm[kl], agm[gid + (a_ld/VW)*k].sC);
- MultiplyAdd(acc[VW*w+13], xlm[kl], agm[gid + (a_ld/VW)*k].sD);
- MultiplyAdd(acc[VW*w+14], xlm[kl], agm[gid + (a_ld/VW)*k].sE);
- MultiplyAdd(acc[VW*w+15], xlm[kl], agm[gid + (a_ld/VW)*k].sF);
- #endif
- }
+ for (int w=0; w<WPT2/VW2; ++w) {
+ const int gid = (WPT2/VW2)*get_global_id(0) + w;
+ #if VW2 == 1
+ MultiplyAdd(acc[VW2*w+0], xlm[kl], agm[gid + (a_ld/VW2)*k]);
+ #elif VW2 == 2
+ MultiplyAdd(acc[VW2*w+0], xlm[kl], agm[gid + (a_ld/VW2)*k].x);
+ MultiplyAdd(acc[VW2*w+1], xlm[kl], agm[gid + (a_ld/VW2)*k].y);
+ #elif VW2 == 4
+ MultiplyAdd(acc[VW2*w+0], xlm[kl], agm[gid + (a_ld/VW2)*k].x);
+ MultiplyAdd(acc[VW2*w+1], xlm[kl], agm[gid + (a_ld/VW2)*k].y);
+ MultiplyAdd(acc[VW2*w+2], xlm[kl], agm[gid + (a_ld/VW2)*k].z);
+ MultiplyAdd(acc[VW2*w+3], xlm[kl], agm[gid + (a_ld/VW2)*k].w);
+ #elif VW2 == 8
+ MultiplyAdd(acc[VW2*w+0], xlm[kl], agm[gid + (a_ld/VW2)*k].s0);
+ MultiplyAdd(acc[VW2*w+1], xlm[kl], agm[gid + (a_ld/VW2)*k].s1);
+ MultiplyAdd(acc[VW2*w+2], xlm[kl], agm[gid + (a_ld/VW2)*k].s2);
+ MultiplyAdd(acc[VW2*w+3], xlm[kl], agm[gid + (a_ld/VW2)*k].s3);
+ MultiplyAdd(acc[VW2*w+4], xlm[kl], agm[gid + (a_ld/VW2)*k].s4);
+ MultiplyAdd(acc[VW2*w+5], xlm[kl], agm[gid + (a_ld/VW2)*k].s5);
+ MultiplyAdd(acc[VW2*w+6], xlm[kl], agm[gid + (a_ld/VW2)*k].s6);
+ MultiplyAdd(acc[VW2*w+7], xlm[kl], agm[gid + (a_ld/VW2)*k].s7);
+ #elif VW2 == 16
+ MultiplyAdd(acc[VW2*w+0], xlm[kl], agm[gid + (a_ld/VW2)*k].s0);
+ MultiplyAdd(acc[VW2*w+1], xlm[kl], agm[gid + (a_ld/VW2)*k].s1);
+ MultiplyAdd(acc[VW2*w+2], xlm[kl], agm[gid + (a_ld/VW2)*k].s2);
+ MultiplyAdd(acc[VW2*w+3], xlm[kl], agm[gid + (a_ld/VW2)*k].s3);
+ MultiplyAdd(acc[VW2*w+4], xlm[kl], agm[gid + (a_ld/VW2)*k].s4);
+ MultiplyAdd(acc[VW2*w+5], xlm[kl], agm[gid + (a_ld/VW2)*k].s5);
+ MultiplyAdd(acc[VW2*w+6], xlm[kl], agm[gid + (a_ld/VW2)*k].s6);
+ MultiplyAdd(acc[VW2*w+7], xlm[kl], agm[gid + (a_ld/VW2)*k].s7);
+ MultiplyAdd(acc[VW2*w+8], xlm[kl], agm[gid + (a_ld/VW2)*k].s8);
+ MultiplyAdd(acc[VW2*w+9], xlm[kl], agm[gid + (a_ld/VW2)*k].s9);
+ MultiplyAdd(acc[VW2*w+10], xlm[kl], agm[gid + (a_ld/VW2)*k].sA);
+ MultiplyAdd(acc[VW2*w+11], xlm[kl], agm[gid + (a_ld/VW2)*k].sB);
+ MultiplyAdd(acc[VW2*w+12], xlm[kl], agm[gid + (a_ld/VW2)*k].sC);
+ MultiplyAdd(acc[VW2*w+13], xlm[kl], agm[gid + (a_ld/VW2)*k].sD);
+ MultiplyAdd(acc[VW2*w+14], xlm[kl], agm[gid + (a_ld/VW2)*k].sE);
+ MultiplyAdd(acc[VW2*w+15], xlm[kl], agm[gid + (a_ld/VW2)*k].sF);
+ #endif
}
}
- // The multiply-add function (transposed)
- else {
+ // Synchronizes all threads in a workgroup
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ // Stores the final result
+ #pragma unroll
+ for (int w=0; w<WPT2; ++w) {
+ const int gid = WPT2*get_global_id(0) + w;
+ AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, ygm[gid*y_inc + y_offset]);
+ }
+}
+
+// =================================================================================================
+
+// Data-widths for the 'fast' kernel with rotated matrix
+#if VW3 == 1
+ typedef real realVFR;
+#elif VW3 == 2
+ typedef real2 realVFR;
+#elif VW3 == 4
+ typedef real4 realVFR;
+#elif VW3 == 8
+ typedef real8 realVFR;
+#elif VW3 == 16
+ typedef real16 realVFR;
+#endif
+
+// Faster version of the kernel, assuming that:
+// --> 'm' and 'n' are multiples of WGS3
+// --> 'a_offset' is 0
+// --> 'a_ld' is a multiple of VW3
+// --> 'a_rotated' is 1
+__attribute__((reqd_work_group_size(WGS3, 1, 1)))
+__kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta,
+ const int a_rotated,
+ const __global realVFR* restrict agm, const int a_offset, const int a_ld,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc) {
+ // Local memory for the vector X
+ __local real xlm[WGS3];
+
+ // Initializes the accumulation register
+ real acc[WPT3];
+ #pragma unroll
+ for (int w=0; w<WPT3; ++w) {
+ SetToZero(acc[w]);
+ }
+
+ // Loops over work-group sized portions of the work
+ for (int kwg=0; kwg<n; kwg+=WGS3) {
+
+ // Loads the vector X into local memory
+ const int lid = get_local_id(0);
+ xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
+
+ // Synchronizes all threads in a workgroup
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // The multiply-add function (rotated)
+ #pragma unroll
+ for (int kl=0; kl<WGS3/VW3; ++kl) {
+ const int k = (kwg/VW3) + kl;
#pragma unroll
- for (int kl=0; kl<WGS/VW; ++kl) {
- const int k = (kwg/VW) + kl;
- #pragma unroll
- for (int w=0; w<WPT; ++w) {
- const int gid = WPT*get_global_id(0) + w;
- realV avec = agm[k + (a_ld/VW)*gid];
- #if VW == 1
- MultiplyAdd(acc[w], xlm[VW*kl+0], avec);
- #elif VW == 2
- MultiplyAdd(acc[w], xlm[VW*kl+0], avec.x);
- MultiplyAdd(acc[w], xlm[VW*kl+1], avec.y);
- #elif VW == 4
- MultiplyAdd(acc[w], xlm[VW*kl+0], avec.x);
- MultiplyAdd(acc[w], xlm[VW*kl+1], avec.y);
- MultiplyAdd(acc[w], xlm[VW*kl+2], avec.z);
- MultiplyAdd(acc[w], xlm[VW*kl+3], avec.w);
- #elif VW == 8
- MultiplyAdd(acc[w], xlm[VW*kl+0], avec.s0);
- MultiplyAdd(acc[w], xlm[VW*kl+1], avec.s1);
- MultiplyAdd(acc[w], xlm[VW*kl+2], avec.s2);
- MultiplyAdd(acc[w], xlm[VW*kl+3], avec.s3);
- MultiplyAdd(acc[w], xlm[VW*kl+4], avec.s4);
- MultiplyAdd(acc[w], xlm[VW*kl+5], avec.s5);
- MultiplyAdd(acc[w], xlm[VW*kl+6], avec.s6);
- MultiplyAdd(acc[w], xlm[VW*kl+7], avec.s7);
- #elif VW == 16
- MultiplyAdd(acc[w], xlm[VW*kl+0], avec.s0);
- MultiplyAdd(acc[w], xlm[VW*kl+1], avec.s1);
- MultiplyAdd(acc[w], xlm[VW*kl+2], avec.s2);
- MultiplyAdd(acc[w], xlm[VW*kl+3], avec.s3);
- MultiplyAdd(acc[w], xlm[VW*kl+4], avec.s4);
- MultiplyAdd(acc[w], xlm[VW*kl+5], avec.s5);
- MultiplyAdd(acc[w], xlm[VW*kl+6], avec.s6);
- MultiplyAdd(acc[w], xlm[VW*kl+7], avec.s7);
- MultiplyAdd(acc[w], xlm[VW*kl+8], avec.s8);
- MultiplyAdd(acc[w], xlm[VW*kl+9], avec.s9);
- MultiplyAdd(acc[w], xlm[VW*kl+10], avec.sA);
- MultiplyAdd(acc[w], xlm[VW*kl+11], avec.sB);
- MultiplyAdd(acc[w], xlm[VW*kl+12], avec.sC);
- MultiplyAdd(acc[w], xlm[VW*kl+13], avec.sD);
- MultiplyAdd(acc[w], xlm[VW*kl+14], avec.sE);
- MultiplyAdd(acc[w], xlm[VW*kl+15], avec.sF);
- #endif
- }
+ for (int w=0; w<WPT3; ++w) {
+ const int gid = WPT3*get_global_id(0) + w;
+ realVFR avec = agm[k + (a_ld/VW3)*gid];
+ #if VW3 == 1
+ MultiplyAdd(acc[w], xlm[VW3*kl+0], avec);
+ #elif VW3 == 2
+ MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
+ MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
+ #elif VW3 == 4
+ MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
+ MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
+ MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.z);
+ MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.w);
+ #elif VW3 == 8
+ MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
+ MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
+ MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
+ MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
+ MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
+ MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
+ MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
+ MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
+ #elif VW3 == 16
+ MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
+ MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
+ MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
+ MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
+ MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
+ MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
+ MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
+ MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
+ MultiplyAdd(acc[w], xlm[VW3*kl+8], avec.s8);
+ MultiplyAdd(acc[w], xlm[VW3*kl+9], avec.s9);
+ MultiplyAdd(acc[w], xlm[VW3*kl+10], avec.sA);
+ MultiplyAdd(acc[w], xlm[VW3*kl+11], avec.sB);
+ MultiplyAdd(acc[w], xlm[VW3*kl+12], avec.sC);
+ MultiplyAdd(acc[w], xlm[VW3*kl+13], avec.sD);
+ MultiplyAdd(acc[w], xlm[VW3*kl+14], avec.sE);
+ MultiplyAdd(acc[w], xlm[VW3*kl+15], avec.sF);
+ #endif
}
}
@@ -267,8 +343,8 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b
// Stores the final result
#pragma unroll
- for (int w=0; w<WPT; ++w) {
- const int gid = WPT*get_global_id(0) + w;
+ for (int w=0; w<WPT3; ++w) {
+ const int gid = WPT3*get_global_id(0) + w;
AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, ygm[gid*y_inc + y_offset]);
}
}
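
All three kernel variants share one decomposition: the vector X is staged into local memory one work-group-sized chunk at a time, each thread accumulates WPT1 (respectively WPT2, WPT3) elements of the output, and the final AXPBY writes alpha*acc + beta*y back to global memory. A minimal host-side model of the non-rotated indexing (plain C++, not part of the patch; the function name and the a_offset == 0, x_inc == 1 simplifications are assumptions for illustration):

  #include <cstddef>
  #include <vector>

  // Models one thread 'tid' of the plain Xgemv kernel: each of 'global_size'
  // threads accumulates 'wpt' rows (WPT1 in the kernel) of column-major A.
  void GemvThreadModel(std::size_t tid, std::size_t global_size, std::size_t wpt,
                       std::size_t m, std::size_t n, std::size_t a_ld,
                       const std::vector<float>& a,   // column-major, a_ld >= m
                       const std::vector<float>& x,
                       std::vector<float>& acc) {     // one accumulator per w
    for (std::size_t w = 0; w < wpt; ++w) {
      const std::size_t gid = w*global_size + tid;    // same mapping as the kernel
      if (gid < m) {
        for (std::size_t k = 0; k < n; ++k) {
          acc[w] += x[k] * a[gid + a_ld*k];           // agm[gid + a_ld*k + a_offset]
        }
      }
    }
  }

For fixed k, consecutive threads read consecutive addresses of A, so the non-rotated layout coalesces naturally; the rotated variant walks along a row of A instead, which is what the VW3-wide vector loads of XgemvFastRot exploit.
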
diff --git a/src/routines/xgemv.cc b/src/routines/xgemv.cc
index 74851ec9..9f3908f8 100644
--- a/src/routines/xgemv.cc
+++ b/src/routines/xgemv.cc
@@ -70,13 +70,30 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
if (ErrorIn(status)) { return status; }
// Determines whether or not the fast-version can be used
- bool use_fast_kernel = (a_offset == 0) &&
- IsMultiple(m, db_["WGS"]*db_["WPT"]) &&
- IsMultiple(n, db_["WGS"]) &&
- IsMultiple(a_ld, db_["VW"]);
-
- // If possible, run the fast-version of the kernel
- auto kernel_name = (use_fast_kernel) ? "XgemvFast" : "Xgemv";
+ bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) &&
+ IsMultiple(m, db_["WGS2"]*db_["WPT2"]) &&
+ IsMultiple(n, db_["WGS2"]) &&
+ IsMultiple(a_ld, db_["VW2"]);
+ bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) &&
+ IsMultiple(m, db_["WGS3"]*db_["WPT3"]) &&
+ IsMultiple(n, db_["WGS3"]) &&
+ IsMultiple(a_ld, db_["VW3"]);
+
+ // If possible, runs the fast version (rotated or non-rotated) of the kernel
+ auto kernel_name = "Xgemv";
+ auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]);
+ auto global_size = m_ceiled / db_["WPT1"];
+ auto local_size = db_["WGS1"];
+ if (use_fast_kernel) {
+ kernel_name = "XgemvFast";
+ global_size = m_real / db_["WPT2"];
+ local_size = db_["WGS2"];
+ }
+ if (use_fast_kernel_rot) {
+ kernel_name = "XgemvFastRot";
+ global_size = m_real / db_["WPT3"];
+ local_size = db_["WGS3"];
+ }
// Retrieves the Xgemv kernel from the compiled binary
try {
@@ -100,9 +117,8 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
kernel.SetArgument(13, static_cast<int>(y_inc));
// Launches the kernel
- auto m_ceiled = Ceil(m_real, db_["WGS"]*db_["WPT"]);
- auto global = std::vector<size_t>{m_ceiled / db_["WPT"]};
- auto local = std::vector<size_t>{db_["WGS"]};
+ auto global = std::vector<size_t>{global_size};
+ auto local = std::vector<size_t>{local_size};
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
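
The kernel selection above is a pure predicate over the tuning-database values. As a standalone sketch (illustrative C++; it assumes CLBlast's IsMultiple(a, b) simply tests a % b == 0, and the function name here is hypothetical):

  #include <cstddef>
  #include <string>

  static bool IsMultipleOf(std::size_t a, std::size_t b) { return a % b == 0; }

  // Mirrors the use_fast_kernel / use_fast_kernel_rot logic of DoGemv.
  std::string SelectGemvKernel(std::size_t a_offset, int a_rotated,
                               std::size_t m, std::size_t n, std::size_t a_ld,
                               std::size_t wgs2, std::size_t wpt2, std::size_t vw2,
                               std::size_t wgs3, std::size_t wpt3, std::size_t vw3) {
    const bool fast = (a_offset == 0) && (a_rotated == 0) &&
                      IsMultipleOf(m, wgs2*wpt2) && IsMultipleOf(n, wgs2) &&
                      IsMultipleOf(a_ld, vw2);
    const bool fast_rot = (a_offset == 0) && (a_rotated == 1) &&
                          IsMultipleOf(m, wgs3*wpt3) && IsMultipleOf(n, wgs3) &&
                          IsMultipleOf(a_ld, vw3);
    if (fast)     { return "XgemvFast"; }
    if (fast_rot) { return "XgemvFastRot"; }
    return "Xgemv";  // the generic kernel handles any size, offset, and rotation
  }

The two fast predicates are mutually exclusive (a_rotated == 0 versus == 1), so at most one fires; the generic Xgemv with its bounds-checked tail loop remains the fallback.
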
diff --git a/src/tuning/tuning.cc b/src/tuning/tuning.cc
index d617af88..2dcb11d5 100644
--- a/src/tuning/tuning.cc
+++ b/src/tuning/tuning.cc
@@ -75,7 +75,8 @@ template void TunerXY<double2>(int, char**, const Tuner2<double2>&);
// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect
// the results. Used for matrix-vector-vector routines.
template <typename T>
-void TunerAXY(int argc, char* argv[], const Tuner3<T> &tune_function) {
+void TunerAXY(int argc, char* argv[], const size_t num_variations,
+ const Tuner3V<T> &tune_function) {
// Sets the parameters and platform/device for which to tune (command-line options)
auto help = std::string{"* Options given/available:\n"};
@@ -83,11 +84,10 @@ void TunerAXY(int argc, char* argv[], const Tuner3<T> &tune_function) {
args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
- args.m = GetArgument(argc, argv, help, kArgM, size_t{1024});
- args.n = GetArgument(argc, argv, help, kArgN, size_t{1024});
+ args.m = GetArgument(argc, argv, help, kArgM, size_t{2048});
+ args.n = GetArgument(argc, argv, help, kArgN, size_t{2048});
args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>());
args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>());
- args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kColMajor);
fprintf(stdout, "%s\n", help.c_str());
// Creates input buffers with random data
@@ -98,36 +98,40 @@ void TunerAXY(int argc, char* argv[], const Tuner3<T> &tune_function) {
PopulateVector(x_vec);
PopulateVector(y_vec);
- // Initializes the tuner for the chosen device
- cltune::Tuner tuner(args.platform_id, args.device_id);
+ // Loop over the different variations of the kernel
+ for (auto variation=size_t{1}; variation<=num_variations; ++variation) {
- // Use full-search to explore all parameter combinations.
- tuner.UseFullSearch();
+ // Initializes the tuner for the chosen device
+ cltune::Tuner tuner(args.platform_id, args.device_id);
- // Configures the tuning parameters (kernel specific)
- tune_function(args, a_mat, x_vec, y_vec, tuner);
+ // Use full-search to explore all parameter combinations.
+ tuner.UseFullSearch();
- // Starts the tuning process
- tuner.Tune();
+ // Configures the tuning parameters (kernel specific)
+ tune_function(args, variation, a_mat, x_vec, y_vec, tuner);
- // Prints the results to screen
- auto time_ms = tuner.PrintToScreen();
- tuner.PrintFormatted();
+ // Starts the tuning process
+ tuner.Tune();
- // Also prints the performance of the best-case in terms of GB/s and GFLOPS
- const auto mega_bytes = ((args.m*args.n + 2*args.m + args.n)*GetBytes(args.precision)) * 1.0e-6;
- const auto mega_flops = (2*args.m*args.n) * 1.0e-6;
- if (time_ms != 0.0) {
- printf("[ -------> ] %.1lf ms or %.1lf GB/s or %.1lf GFLOPS\n",
- time_ms, mega_bytes/time_ms, mega_flops/time_ms);
+ // Prints the results to screen
+ auto time_ms = tuner.PrintToScreen();
+ tuner.PrintFormatted();
+
+ // Also prints the performance of the best-case in terms of GB/s and GFLOPS
+ const auto mega_bytes = ((args.m*args.n + 2*args.m + args.n)*GetBytes(args.precision)) * 1.0e-6;
+ const auto mega_flops = (2*args.m*args.n) * 1.0e-6;
+ if (time_ms != 0.0) {
+ printf("[ -------> ] %.1lf ms or %.1lf GB/s or %.1lf GFLOPS\n",
+ time_ms, mega_bytes/time_ms, mega_flops/time_ms);
+ }
}
}
// Compiles the above function
-template void TunerAXY<float>(int, char**, const Tuner3<float>&);
-template void TunerAXY<double>(int, char**, const Tuner3<double>&);
-template void TunerAXY<float2>(int, char**, const Tuner3<float2>&);
-template void TunerAXY<double2>(int, char**, const Tuner3<double2>&);
+template void TunerAXY<float>(int, char**, const size_t, const Tuner3V<float>&);
+template void TunerAXY<double>(int, char**, const size_t, const Tuner3V<double>&);
+template void TunerAXY<float2>(int, char**, const size_t, const Tuner3V<float2>&);
+template void TunerAXY<double2>(int, char**, const size_t, const Tuner3V<double2>&);
// =================================================================================================
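
The GB/s and GFLOPS figures follow directly from the counts in the code above: one pass over the m-by-n matrix, a read of x, and a read-modify-write of y move m*n + 2*m + n elements, and one multiply-add per matrix element costs 2*m*n flops. A worked check with assumed numbers (m = n = 2048, single precision, a hypothetical best time of 2.0 ms):

  #include <cstdio>

  int main() {
    const double m = 2048, n = 2048, bytes = 4;            // float elements
    const double mega_bytes = (m*n + 2*m + n)*bytes*1e-6;  // ~16.8 MB moved
    const double mega_flops = 2*m*n*1e-6;                  // ~8.4 Mflop
    const double time_ms = 2.0;                            // assumed best time
    std::printf("%.1lf GB/s, %.1lf GFLOPS\n",
                mega_bytes/time_ms, mega_flops/time_ms);   // 8.4 GB/s, 4.2 GFLOPS
  }

Mega-units divided by milliseconds equal giga-units per second, which is why the 1.0e-6 scale factors in the tuner pair directly with time_ms.
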
diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc
index e2d54729..dccd250c 100644
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/xgemv.cc
@@ -8,6 +8,10 @@
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements an auto-tuner to tune the Xgemv OpenCL kernel. It uses the CLTune library.
+// Three variations of the kernel are tuned:
+// 1: The full version of the kernel
+// 2: The fast version for non-rotated matrices
+// 3: The fast version for rotated matrices
//
// =================================================================================================
@@ -23,43 +27,60 @@ namespace clblast {
// The Xgemv auto-tuner
template <typename T>
-void XgemvTune(const Arguments<T> &args,
+void XgemvTune(const Arguments<T> &args, const size_t variation,
const std::vector<T> &a_mat, const std::vector<T> &x_vec, std::vector<T> &y_vec,
cltune::Tuner &tuner) {
+ // Sets the kernel name and the a_rotated argument based on the variation
+ auto kernel_name = (variation == 1) ? "Xgemv" : ((variation == 2) ? "XgemvFast" : "XgemvFastRot");
+ auto a_rotated = (variation == 3) ? 1 : 0;
+
// This points to the Xgemv kernel as found in the CLBlast library
std::string common_source =
#include "../src/kernels/common.opencl"
std::string kernel_source =
#include "../src/kernels/xgemv.opencl"
auto sources = common_source + kernel_source;
- auto id = tuner.AddKernelFromString(sources, "XgemvFast", {args.m}, {1});
+ auto id = tuner.AddKernelFromString(sources, kernel_name, {args.m}, {1});
tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64});
- // Sets the tunable parameters and their possible values
- tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 1536, 2048});
- tuner.AddParameter(id, "WPT", {1, 2, 4, 8});
- tuner.AddParameter(id, "VW", {1, 2, 4, 8});
+ // Helper for the constraints
+ auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
+
+ // Sets the tunable parameters, their possible values, the adjusted thread sizes, and constraints
+ if (variation == 1) {
+ tuner.AddParameter(id, "WGS1", {64, 128, 256, 512, 1024, 1536, 2048});
+ tuner.AddParameter(id, "WPT1", {1, 2, 4, 8});
+ tuner.MulLocalSize(id, {"WGS1"});
+ tuner.DivGlobalSize(id, {"WPT1"});
+ }
+ else if (variation == 2) {
+ tuner.AddParameter(id, "WGS2", {64, 128, 256, 512, 1024, 1536, 2048});
+ tuner.AddParameter(id, "WPT2", {1, 2, 4, 8});
+ tuner.AddParameter(id, "VW2", {1, 2, 4, 8});
+ tuner.MulLocalSize(id, {"WGS2"});
+ tuner.DivGlobalSize(id, {"WPT2"});
+ tuner.AddConstraint(id, MultipleOfX, {"WPT2", "VW2"});
+ }
+ else if (variation == 3) {
+ tuner.AddParameter(id, "WGS3", {64, 128, 256, 512, 1024, 1536, 2048});
+ tuner.AddParameter(id, "WPT3", {1, 2, 4, 8});
+ tuner.AddParameter(id, "VW3", {1, 2, 4, 8});
+ tuner.MulLocalSize(id, {"WGS3"});
+ tuner.DivGlobalSize(id, {"WPT3"});
+ tuner.AddConstraint(id, MultipleOfX, {"WGS3", "VW3"});
+ }
// Tests for a specific precision
tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
- // Sets the constraints
- auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
- tuner.AddConstraint(id, MultipleOfX, {"WGS", "VW"});
- tuner.AddConstraint(id, MultipleOfX, {"WPT", "VW"});
-
- // Modifies the thread-sizes (local) based on the parameters
- tuner.MulLocalSize(id, {"WGS"});
- tuner.DivGlobalSize(id, {"WPT"});
-
// Sets the function's arguments
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(args.alpha);
tuner.AddArgumentScalar(args.beta);
- tuner.AddArgumentScalar(static_cast<int>(args.layout));
+ tuner.AddArgumentScalar(static_cast<int>(a_rotated));
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentScalar(0);
tuner.AddArgumentScalar(static_cast<int>(args.m));
@@ -75,12 +96,13 @@ void XgemvTune(const Arguments<T> &args,
// Main function which calls the common client code with the routine-specific function as argument.
void TunerXgemv(int argc, char *argv[]) {
+ auto num_variations = size_t{3};
switch(GetPrecision(argc, argv)) {
case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
- case Precision::kSingle: TunerAXY<float>(argc, argv, XgemvTune<float>); break;
- case Precision::kDouble: TunerAXY<double>(argc, argv, XgemvTune<double>); break;
- case Precision::kComplexSingle: TunerAXY<float2>(argc, argv, XgemvTune<float2>); break;
- case Precision::kComplexDouble: TunerAXY<double2>(argc, argv, XgemvTune<double2>); break;
+ case Precision::kSingle: TunerAXY<float>(argc, argv, num_variations, XgemvTune<float>); break;
+ case Precision::kDouble: TunerAXY<double>(argc, argv, num_variations, XgemvTune<double>); break;
+ case Precision::kComplexSingle: TunerAXY<float2>(argc, argv, num_variations, XgemvTune<float2>); break;
+ case Precision::kComplexDouble: TunerAXY<double2>(argc, argv, num_variations, XgemvTune<double2>); break;
}
}
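
As a sanity-check of the per-variation thread-size bookkeeping: the tuner starts from a global size of {args.m} and a local size of {1}, so MulLocalSize(id, {"WGS2"}) and DivGlobalSize(id, {"WPT2"}) reproduce the launch that DoGemv computes on the host. A small standalone check with assumed values (m = 2048, WGS2 = 128, WPT2 = 4, VW2 = 2):

  #include <cassert>
  #include <cstddef>

  int main() {
    const std::size_t m = 2048, wgs2 = 128, wpt2 = 4, vw2 = 2;
    assert(wpt2 % vw2 == 0);              // the MultipleOfX constraint on {WPT2, VW2}
    const std::size_t global = m / wpt2;  // DivGlobalSize: 2048/4 = 512 threads
    const std::size_t local  = wgs2;      // MulLocalSize:  groups of 128
    assert(global % local == 0);          // a valid OpenCL NDRange
    const std::size_t loads = wpt2 / vw2; // vector loads per thread per column of A
    (void)loads;
  }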