path: root/src/kernels
author    CNugteren <web@cedricnugteren.nl>  2015-05-30 12:30:43 +0200
committer CNugteren <web@cedricnugteren.nl>  2015-05-30 12:30:43 +0200
commit    bc5a341dfe591946e925db315fc7d8c0c25c2938 (patch)
tree      b216ab5eee4863e3807d92b5ddd19fa22197ed22 /src/kernels
parent    c7b054ea6747039f4405fd93da6e924f3e5c7f4b (diff)
Initial commit of preview version
Diffstat (limited to 'src/kernels')
-rw-r--r--  src/kernels/common.opencl        120
-rw-r--r--  src/kernels/copy.opencl           73
-rw-r--r--  src/kernels/pad.opencl           180
-rw-r--r--  src/kernels/padtranspose.opencl  150
-rw-r--r--  src/kernels/transpose.opencl     168
-rw-r--r--  src/kernels/xaxpy.opencl         128
-rw-r--r--  src/kernels/xgemm.opencl         570
7 files changed, 1389 insertions, 0 deletions
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
new file mode 100644
index 00000000..154265e4
--- /dev/null
+++ b/src/kernels/common.opencl
@@ -0,0 +1,120 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common defines and type-defs for the CLBlast OpenCL kernels.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment out this line for syntax highlighting when developing.
+R"(
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this file is used outside of the CLBlast library.
+#ifndef PRECISION
+ #define PRECISION 32 // Data-types: single or double precision, complex or regular
+#endif
+
+// =================================================================================================
+
+// Enable support for double-precision
+#if PRECISION == 64 || PRECISION == 6464
+ #if __OPENCL_VERSION__ <= CL_VERSION_1_1
+ #pragma OPENCL EXTENSION cl_khr_fp64: enable
+ #endif
+#endif
+
+// Single-precision
+#if PRECISION == 32
+ typedef float real;
+ typedef float2 real2;
+ typedef float4 real4;
+ typedef float8 real8;
+ typedef float16 real16;
+ #define ZERO 0.0f
+
+// Double-precision
+#elif PRECISION == 64
+ typedef double real;
+ typedef double2 real2;
+ typedef double4 real4;
+ typedef double8 real8;
+ typedef double16 real16;
+ #define ZERO 0.0
+
+// Complex single-precision
+#elif PRECISION == 3232
+ typedef struct cfloat {float x; float y;} real;
+ typedef struct cfloat2 {real x; real y;} real2;
+ typedef struct cfloat4 {real x; real y; real z; real w;} real4;
+ typedef struct cfloat8 {real s0; real s1; real s2; real s3;
+ real s4; real s5; real s6; real s7;} real8;
+ typedef struct cfloat16 {real s0; real s1; real s2; real s3;
+ real s4; real s5; real s6; real s7;
+ real s8; real s9; real sA; real sB;
+ real sC; real sD; real sE; real sF;} real16;
+ #define ZERO 0.0f
+
+// Complex double-precision
+#elif PRECISION == 6464
+ typedef struct cdouble {double x; double y;} real;
+ typedef struct cdouble2 {real x; real y;} real2;
+ typedef struct cdouble4 {real x; real y; real z; real w;} real4;
+ typedef struct cdouble8 {real s0; real s1; real s2; real s3;
+ real s4; real s5; real s6; real s7;} real8;
+ typedef struct cdouble16 {real s0; real s1; real s2; real s3;
+ real s4; real s5; real s6; real s7;
+ real s8; real s9; real sA; real sB;
+ real sC; real sD; real sE; real sF;} real16;
+ #define ZERO 0.0
+#endif
+
+// =================================================================================================
+
+// Don't use the non-IEEE754-compliant OpenCL built-in mad() instruction (set to 1 to enable)
+#define USE_CL_MAD 0
+
+// Sets a variable to zero
+#if PRECISION == 3232 || PRECISION == 6464
+ #define SetToZero(a) a.x = ZERO; a.y = ZERO
+#else
+ #define SetToZero(a) a = ZERO
+#endif
+
+// Multiplies two complex variables (used in the defines below)
+#if PRECISION == 3232 || PRECISION == 6464
+ #define MulReal(a, b) a.x*b.x - a.y*b.y
+ #define MulImag(a, b) a.x*b.y + a.y*b.x
+#endif
+
+// The scalar multiply-add function
+#if PRECISION == 3232 || PRECISION == 6464
+ #define MultiplyAdd(c, a, b) c.x += MulReal(a,b); c.y += MulImag(a,b)
+#else
+ #if USE_CL_MAD == 1
+ #define MultiplyAdd(c, a, b) c = mad(a, b, c)
+ #else
+ #define MultiplyAdd(c, a, b) c += a * b
+ #endif
+#endif
+
+// The scalar AXPBY function
+#if PRECISION == 3232 || PRECISION == 6464
+ #define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d)
+#else
+ #define AXPBY(e, a, b, c, d) e = a*b + c*d
+#endif
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
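As a worked example of the complex arithmetic defined above: the following is a minimal
host-side C sketch (not part of the commit) that mirrors the MulReal/MulImag/MultiplyAdd
macros for the PRECISION == 3232 case, so the expansion can be checked on the host.

    /* Sketch only: mirrors the complex multiply-add macros from common.opencl
     * for PRECISION == 3232. */
    #include <stdio.h>

    typedef struct { float x; float y; } cfloat;  /* matches 'real' for 3232 */

    #define MulReal(a, b) ((a).x*(b).x - (a).y*(b).y)
    #define MulImag(a, b) ((a).x*(b).y + (a).y*(b).x)
    #define MultiplyAdd(c, a, b) do { (c).x += MulReal(a, b); \
                                      (c).y += MulImag(a, b); } while (0)

    int main(void) {
      cfloat c = {0.0f, 0.0f};
      cfloat a = {1.0f, 2.0f};  /* 1 + 2i */
      cfloat b = {3.0f, 4.0f};  /* 3 + 4i */
      MultiplyAdd(c, a, b);     /* c += a*b = (1*3 - 2*4) + (1*4 + 2*3)i */
      printf("c = %.1f + %.1fi\n", c.x, c.y);  /* prints: c = -5.0 + 10.0i */
      return 0;
    }

Note that the kernel's actual MultiplyAdd macro is two bare statements; the do/while wrapper
here only makes the host sketch safe to use in single-statement contexts.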
diff --git a/src/kernels/copy.opencl b/src/kernels/copy.opencl
new file mode 100644
index 00000000..f95b476b
--- /dev/null
+++ b/src/kernels/copy.opencl
@@ -0,0 +1,73 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS routines, in this case
+// kernels to copy matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment out this line for syntax highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef COPY_DIMX
+ #define COPY_DIMX 8 // Local workgroup size in the first dimension (x)
+#endif
+#ifndef COPY_DIMY
+ #define COPY_DIMY 8 // Local workgroup size in the second dimension (y)
+#endif
+#ifndef COPY_WPT
+ #define COPY_WPT 1 // Work per thread in the second dimension (y)
+#endif
+#ifndef COPY_VW
+ #define COPY_VW 1 // Vector width in the first dimension (x)
+#endif
+
+// =================================================================================================
+
+// Data-widths
+#if COPY_VW == 1
+ typedef real realC;
+#elif COPY_VW == 2
+ typedef real2 realC;
+#elif COPY_VW == 4
+ typedef real4 realC;
+#elif COPY_VW == 8
+ typedef real8 realC;
+#elif COPY_VW == 16
+ typedef real16 realC;
+#endif
+
+// =================================================================================================
+
+// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
+// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
+__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+__kernel void CopyMatrix(const int ld,
+ __global const realC* restrict src,
+ __global realC* dest) {
+ #pragma unroll
+ for (int w_one=0; w_one<COPY_WPT; ++w_one) {
+ const int id_one = get_global_id(0);
+ const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1);
+ const int id = id_two*(ld/COPY_VW) + id_one;
+ dest[id] = src[id];
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
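For reference, the launch geometry implied by the kernel's indexing: dimension 0 covers the
first matrix dimension in units of COPY_VW-wide vectors, and dimension 1 covers the second
dimension divided by COPY_WPT. A minimal host-side C sketch (not part of the commit, and
assuming the exact divisibility the kernel itself requires):

    #include <stdio.h>

    enum { COPY_DIMX = 8, COPY_DIMY = 8, COPY_WPT = 1, COPY_VW = 1 };

    int main(void) {
      const size_t size_one = 1024, size_two = 512;  /* example matrix dimensions */
      const size_t global[2] = { size_one / COPY_VW, size_two / COPY_WPT };
      const size_t local[2]  = { COPY_DIMX, COPY_DIMY };
      printf("global = {%zu, %zu}, local = {%zu, %zu}\n",
             global[0], global[1], local[0], local[1]);
      return 0;
    }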
diff --git a/src/kernels/pad.opencl b/src/kernels/pad.opencl
new file mode 100644
index 00000000..ccaeb9d6
--- /dev/null
+++ b/src/kernels/pad.opencl
@@ -0,0 +1,180 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS routines, in this case
+// kernels to copy and pad matrices in various ways, including:
+// 1) copying into a larger matrix by adding padding
+// 2) copying into a smaller matrix by removing padding
+// 3) from upper/lower triangle into a full matrix
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment out this line for syntax highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef PAD_DIMX
+ #define PAD_DIMX 8 // Local workgroup size in the first dimension (x)
+#endif
+#ifndef PAD_DIMY
+ #define PAD_DIMY 8 // Local workgroup size in the second dimension (y)
+#endif
+#ifndef PAD_WPTX
+ #define PAD_WPTX 1 // Work per thread in the first dimension (x)
+#endif
+#ifndef PAD_WPTY
+ #define PAD_WPTY 1 // Work per thread in the second dimension (y)
+#endif
+
+// =================================================================================================
+
+// Copies a matrix from source to destination. The output is padded with zero values in case the
+// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
+// value and offset can be different.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void PadMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Loops over the work per thread in both dimensions
+ #pragma unroll
+ for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ #pragma unroll
+ for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ if (id_two < dest_two && id_one < dest_one) {
+
+ // Loads data if the thread IDs are within bounds of the source matrix. Otherwise, set the
+ // value to be written to zero.
+ real value;
+ SetToZero(value);
+ if (id_two < src_two && id_one < src_one) {
+ value = src[id_two*src_ld + id_one + src_offset];
+ }
+
+ // Stores the value in the destination matrix
+ dest[id_two*dest_ld + id_one + dest_offset] = value;
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
+// writes only the actual data back to the destination matrix. Again, the ld value and offset can
+// be different.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void UnPadMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Loops over the work per thread in both dimensions
+ #pragma unroll
+ for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ #pragma unroll
+ for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ if (id_two < dest_two && id_one < dest_one) {
+
+ // Copies the value into the destination matrix. This is always within bounds of the source
+ // matrix, as we know that the destination matrix is smaller than the source.
+ dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// Kernel to populate a square symmetric matrix, given that the data is stored in the
+// lower-triangle of the input matrix. This uses the padding kernel's parameters.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void SymmLowerToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Loops over the work per thread in both dimensions
+ #pragma unroll
+ for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ #pragma unroll
+ for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ if (id_two < dest_dim && id_one < dest_dim) {
+
+ // Loads data from the lower-symmetric matrix
+ real value;
+ SetToZero(value);
+ if (id_two < src_dim && id_one < src_dim) {
+ if (id_two <= id_one) { value = src[id_two*src_ld + id_one + src_offset]; }
+ else { value = src[id_one*src_ld + id_two + src_offset]; }
+ }
+
+ // Stores the value in the destination matrix
+ dest[id_two*dest_ld + id_one + dest_offset] = value;
+ }
+ }
+ }
+}
+
+// Same as above, but now the matrix's data is stored in the upper-triangle
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void SymmUpperToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Loops over the work per thread in both dimensions
+ #pragma unroll
+ for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ #pragma unroll
+ for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ if (id_two < dest_dim && id_one < dest_dim) {
+
+ // Loads data from the upper-symmetric matrix
+ real value;
+ SetToZero(value);
+ if (id_two < src_dim && id_one < src_dim) {
+ if (id_one <= id_two) { value = src[id_two*src_ld + id_one + src_offset]; }
+ else { value = src[id_one*src_ld + id_two + src_offset]; }
+ }
+
+ // Stores the value in the destination matrix
+ dest[id_two*dest_ld + id_one + dest_offset] = value;
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
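To illustrate the triangle mirroring in SymmLowerToSquared: a small self-contained C sketch
(not part of the commit) applying the same index arithmetic to a 3x3 column-major matrix.

    #include <stdio.h>

    int main(void) {
      const int dim = 3, ld = 3;
      /* Column-major storage; only the lower triangle holds data, '9' marks unused slots. */
      const float src[9] = {1, 2, 3,  9, 4, 5,  9, 9, 6};
      float dest[9];
      for (int id_two = 0; id_two < dim; ++id_two) {    /* column index */
        for (int id_one = 0; id_one < dim; ++id_one) {  /* row index */
          dest[id_two*ld + id_one] = (id_two <= id_one)
              ? src[id_two*ld + id_one]   /* lower triangle: read directly */
              : src[id_one*ld + id_two];  /* upper triangle: mirror across the diagonal */
        }
      }
      for (int r = 0; r < dim; ++r) {  /* prints the reconstructed symmetric matrix */
        for (int c = 0; c < dim; ++c) { printf("%4.0f", dest[c*ld + r]); }
        printf("\n");                  /* rows: 1 2 3 / 2 4 5 / 3 5 6 */
      }
      return 0;
    }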
diff --git a/src/kernels/padtranspose.opencl b/src/kernels/padtranspose.opencl
new file mode 100644
index 00000000..67cbf341
--- /dev/null
+++ b/src/kernels/padtranspose.opencl
@@ -0,0 +1,150 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS functions, in this case
+// kernels to transpose matrices in various ways, including:
+// 1) transposing into a larger matrix by adding padding
+// 2) transposing into a smaller matrix by removing padding
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment out this line for syntax highlighting when developing.
+R"(
+
+// =================================================================================================
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef PADTRA_TILE
+ #define PADTRA_TILE 8 // Number of local threads in the two dimensions (x,y)
+#endif
+#ifndef PADTRA_WPT
+ #define PADTRA_WPT 1 // Amount of work per thread
+#endif
+#ifndef PADTRA_PAD
+ #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
+#endif
+
+// =================================================================================================
+
+// Same as the PadMatrix kernel (in 'pad.opencl'), but now also performs a transpose
+__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+__kernel void PadTransposeMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Local memory to store a tile of the matrix (for coalescing)
+ __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+ #pragma unroll
+ for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+ // Computes the identifiers for the source matrix. Note that the local and global dimensions
+ // do not correspond to each other!
+ const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
+ const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);
+
+ // Loads data into the local memory if the thread IDs are within bounds of the source matrix.
+ // Otherwise, set the local memory value to zero.
+ real value;
+ SetToZero(value);
+ if (id_src_two < src_two && id_src_one < src_one) {
+ value = src[id_src_two*src_ld + id_src_one + src_offset];
+ }
+ tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
+ }
+ }
+
+ // Synchronizes all threads in a workgroup
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+ #pragma unroll
+ for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+ // Computes the identifiers for the destination matrix
+ const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
+ const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
+
+ // Stores the transposed value in the destination matrix
+ if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+ real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+ dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+ }
+ }
+ }
+}
+
+// Same as the UnPadMatrix kernel (in 'pad.opencl'), but now also performs a transpose
+__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+__kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Local memory to store a tile of the matrix (for coalescing)
+ __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+ #pragma unroll
+ for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+ // Computes the identifiers for the source matrix. Note that the local and global dimensions
+ // do not correspond to each other!
+ const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
+ const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);
+
+ // Loads data into the local memory if the thread IDs are within bounds of the source matrix.
+ if ((id_src_one < src_one) && (id_src_two < src_two)) {
+ real value = src[id_src_two*src_ld + id_src_one + src_offset];
+ tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
+ }
+ }
+ }
+
+ // Synchronizes all threads in a workgroup
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+ #pragma unroll
+ for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+ // Computes the identifiers for the destination matrix
+ const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
+ const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
+
+ // Stores the transposed value in the destination matrix
+ if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+ real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+ dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
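The PADTRA_PAD parameter exists because the tile is read along the other dimension after the
barrier: if the tile's row stride is a multiple of the number of local-memory banks, a column
walk keeps hitting the same bank. A small C sketch (not part of the commit; the 32-bank count
is an assumption, typical of many GPUs) shows the effect of one element of padding:

    #include <stdio.h>

    int main(void) {
      const int banks = 32, tile = 32;
      for (int pad = 0; pad <= 1; ++pad) {
        const int stride = tile + pad;
        printf("pad=%d banks:", pad);
        for (int row = 0; row < 4; ++row) {
          printf(" %2d", (row * stride) % banks);  /* bank hit by tile[row][0] */
        }
        printf("\n");  /* pad=0 -> 0 0 0 0 (conflicts); pad=1 -> 0 1 2 3 */
      }
      return 0;
    }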
diff --git a/src/kernels/transpose.opencl b/src/kernels/transpose.opencl
new file mode 100644
index 00000000..79ab1688
--- /dev/null
+++ b/src/kernels/transpose.opencl
@@ -0,0 +1,168 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS functions, in this case
+// kernels to transpose matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment out this line for syntax highlighting when developing.
+R"(
+
+// =================================================================================================
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef TRA_DIM
+ #define TRA_DIM 8 // Number of local threads in the two dimensions (x,y)
+#endif
+#ifndef TRA_WPT
+ #define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other
+#endif
+#ifndef TRA_PAD
+ #define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
+#endif
+
+// =================================================================================================
+
+// Data-widths
+#if TRA_WPT == 1
+ typedef real realT;
+#elif TRA_WPT == 2
+ typedef real2 realT;
+#elif TRA_WPT == 4
+ typedef real4 realT;
+#elif TRA_WPT == 8
+ typedef real8 realT;
+#elif TRA_WPT == 16
+ typedef real16 realT;
+#endif
+
+// =================================================================================================
+
+// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
+// offset. A more general version is available in 'padtranspose.opencl'.
+__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+__kernel void TransposeMatrix(const int ld,
+ __global const realT* restrict src,
+ __global realT* dest) {
+
+ // Local memory to store a tile of the matrix (for coalescing)
+ __local real tile[TRA_WPT*TRA_DIM][TRA_WPT*TRA_DIM + TRA_PAD];
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_one=0; w_one<TRA_WPT; ++w_one) {
+
+ // Computes the identifiers for the source matrix. Note that the local and global dimensions
+ // do not correspond to each other!
+ const int id_one = get_group_id(1) * TRA_DIM + get_local_id(0);
+ const int id_two = (get_group_id(0) * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
+
+ // Loads data into the local memory
+ realT value = src[id_two*(ld/TRA_WPT) + id_one];
+ #if TRA_WPT == 1
+ tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value;
+ #elif TRA_WPT == 2
+ tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
+ tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
+ #elif TRA_WPT == 4
+ tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
+ tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
+ tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.z;
+ tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.w;
+ #elif TRA_WPT == 8
+ tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
+ tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
+ tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
+ tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
+ tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
+ tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
+ tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
+ tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
+ #elif TRA_WPT == 16
+ tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
+ tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
+ tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
+ tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
+ tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
+ tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
+ tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
+ tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
+ tile[get_local_id(1)*TRA_WPT + 8][get_local_id(0)*TRA_WPT + w_one] = value.s8;
+ tile[get_local_id(1)*TRA_WPT + 9][get_local_id(0)*TRA_WPT + w_one] = value.s9;
+ tile[get_local_id(1)*TRA_WPT + 10][get_local_id(0)*TRA_WPT + w_one] = value.sA;
+ tile[get_local_id(1)*TRA_WPT + 11][get_local_id(0)*TRA_WPT + w_one] = value.sB;
+ tile[get_local_id(1)*TRA_WPT + 12][get_local_id(0)*TRA_WPT + w_one] = value.sC;
+ tile[get_local_id(1)*TRA_WPT + 13][get_local_id(0)*TRA_WPT + w_one] = value.sD;
+ tile[get_local_id(1)*TRA_WPT + 14][get_local_id(0)*TRA_WPT + w_one] = value.sE;
+ tile[get_local_id(1)*TRA_WPT + 15][get_local_id(0)*TRA_WPT + w_one] = value.sF;
+ #endif
+ }
+
+ // Synchronizes all threads in a workgroup
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_two=0; w_two<TRA_WPT; ++w_two) {
+
+ // Computes the identifiers for the destination matrix
+ const int id_one = get_global_id(0);
+ const int id_two = get_global_id(1)*TRA_WPT + w_two;
+
+ // Stores the transposed value in the destination matrix
+ realT value;
+ #if TRA_WPT == 1
+ value = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
+ #elif TRA_WPT == 2
+ value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
+ value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
+ #elif TRA_WPT == 4
+ value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
+ value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
+ value.z = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
+ value.w = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
+ #elif TRA_WPT == 8
+ value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
+ value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
+ value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
+ value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
+ value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
+ value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
+ value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
+ value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
+ #elif TRA_WPT == 16
+ value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
+ value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
+ value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
+ value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
+ value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
+ value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
+ value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
+ value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
+ value.s8 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 8];
+ value.s9 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 9];
+ value.sA = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 10];
+ value.sB = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 11];
+ value.sC = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 12];
+ value.sD = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 13];
+ value.sE = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 14];
+ value.sF = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 15];
+ #endif
+ dest[id_two*(ld/TRA_WPT) + id_one] = value;
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
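The access pattern of TransposeMatrix, stripped of vector widths and work-group bookkeeping,
is the classic tile-based transpose. A scalar single-tile C sketch (not part of the commit,
with TRA_WPT == 1 and one work-group, so the local tile becomes a plain 2-D array):

    #include <stdio.h>

    #define DIM 4

    int main(void) {
      float src[DIM*DIM], dest[DIM*DIM], tile[DIM][DIM];
      for (int i = 0; i < DIM*DIM; ++i) { src[i] = (float)i; }
      /* Phase 1: load the source tile into the scratch array. */
      for (int two = 0; two < DIM; ++two)
        for (int one = 0; one < DIM; ++one)
          tile[two][one] = src[two*DIM + one];
      /* Phase 2 (after the barrier, in the kernel): write it back with swapped indices. */
      for (int two = 0; two < DIM; ++two)
        for (int one = 0; one < DIM; ++one)
          dest[two*DIM + one] = tile[one][two];
      printf("dest[1] = %.0f\n", dest[1]);  /* prints 4: element (1,0) moved to (0,1) */
      return 0;
    }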
diff --git a/src/kernels/xaxpy.opencl b/src/kernels/xaxpy.opencl
new file mode 100644
index 00000000..40c6c3bd
--- /dev/null
+++ b/src/kernels/xaxpy.opencl
@@ -0,0 +1,128 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit
+// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
+// support vector data-types.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment out this line for syntax highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef WGS
+ #define WGS 64 // The local work-group size
+#endif
+#ifndef WPT
+ #define WPT 1 // The amount of work-per-thread
+#endif
+#ifndef VW
+ #define VW 1 // Vector width of vectors X and Y
+#endif
+
+// =================================================================================================
+
+// Data-widths
+#if VW == 1
+ typedef real realV;
+#elif VW == 2
+ typedef real2 realV;
+#elif VW == 4
+ typedef real4 realV;
+#elif VW == 8
+ typedef real8 realV;
+#elif VW == 16
+ typedef real16 realV;
+#endif
+
+// =================================================================================================
+
+// The vectorized multiply-add function
+inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) {
+ #if VW == 1
+ MultiplyAdd(cvec, aval, bvec);
+ #elif VW == 2
+ MultiplyAdd(cvec.x, aval, bvec.x);
+ MultiplyAdd(cvec.y, aval, bvec.y);
+ #elif VW == 4
+ MultiplyAdd(cvec.x, aval, bvec.x);
+ MultiplyAdd(cvec.y, aval, bvec.y);
+ MultiplyAdd(cvec.z, aval, bvec.z);
+ MultiplyAdd(cvec.w, aval, bvec.w);
+ #elif VW == 8
+ MultiplyAdd(cvec.s0, aval, bvec.s0);
+ MultiplyAdd(cvec.s1, aval, bvec.s1);
+ MultiplyAdd(cvec.s2, aval, bvec.s2);
+ MultiplyAdd(cvec.s3, aval, bvec.s3);
+ MultiplyAdd(cvec.s4, aval, bvec.s4);
+ MultiplyAdd(cvec.s5, aval, bvec.s5);
+ MultiplyAdd(cvec.s6, aval, bvec.s6);
+ MultiplyAdd(cvec.s7, aval, bvec.s7);
+ #elif VW == 16
+ MultiplyAdd(cvec.s0, aval, bvec.s0);
+ MultiplyAdd(cvec.s1, aval, bvec.s1);
+ MultiplyAdd(cvec.s2, aval, bvec.s2);
+ MultiplyAdd(cvec.s3, aval, bvec.s3);
+ MultiplyAdd(cvec.s4, aval, bvec.s4);
+ MultiplyAdd(cvec.s5, aval, bvec.s5);
+ MultiplyAdd(cvec.s6, aval, bvec.s6);
+ MultiplyAdd(cvec.s7, aval, bvec.s7);
+ MultiplyAdd(cvec.s8, aval, bvec.s8);
+ MultiplyAdd(cvec.s9, aval, bvec.s9);
+ MultiplyAdd(cvec.sA, aval, bvec.sA);
+ MultiplyAdd(cvec.sB, aval, bvec.sB);
+ MultiplyAdd(cvec.sC, aval, bvec.sC);
+ MultiplyAdd(cvec.sD, aval, bvec.sD);
+ MultiplyAdd(cvec.sE, aval, bvec.sE);
+ MultiplyAdd(cvec.sF, aval, bvec.sF);
+ #endif
+ return cvec;
+}
+
+// =================================================================================================
+
+// Full version of the kernel with offsets and strided accesses
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void Xaxpy(const int n, const real alpha,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc) {
+
+ // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+ #pragma unroll
+ for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
+ MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xgm[id*x_inc + x_offset]);
+ }
+}
+
+// =================================================================================================
+
+// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
+// divisible by 'VW', 'WGS' and 'WPT'.
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void XaxpyFast(const int n, const real alpha,
+ const __global realV* restrict xgm,
+ __global realV* ygm) {
+ #pragma unroll
+ for (int w=0; w<WPT; ++w) {
+ const int id = w*get_global_size(0) + get_global_id(0);
+ ygm[id] = MultiplyAddVector(ygm[id], alpha, xgm[id]);
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
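The two kernels imply a dispatch decision on the host: XaxpyFast only applies when the stated
preconditions hold. A hedged host-side C sketch (not part of the commit; the function name is
illustrative only):

    #include <stdbool.h>
    #include <stdio.h>

    enum { WGS = 64, WPT = 1, VW = 1 };

    /* True when the fast vectorized kernel's preconditions hold: zero offsets,
     * unit increments, and 'n' divisible by WGS, WPT and VW. */
    static bool CanUseXaxpyFast(int n, int x_offset, int x_inc,
                                int y_offset, int y_inc) {
      return x_offset == 0 && y_offset == 0 && x_inc == 1 && y_inc == 1 &&
             n % WGS == 0 && n % WPT == 0 && n % VW == 0;
    }

    int main(void) {
      printf("%s\n", CanUseXaxpyFast(4096, 0, 1, 0, 1) ? "XaxpyFast" : "Xaxpy");
      printf("%s\n", CanUseXaxpyFast(4097, 0, 1, 0, 1) ? "XaxpyFast" : "Xaxpy");
      return 0;
    }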
diff --git a/src/kernels/xgemm.opencl b/src/kernels/xgemm.opencl
new file mode 100644
index 00000000..facaf5dc
--- /dev/null
+++ b/src/kernels/xgemm.opencl
@@ -0,0 +1,570 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
+// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
+// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
+// supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
+//
+// Matrices are accessed as follows:
+// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
+// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n)
+// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m)
+//
+// Or as an image (assuming column-major)
+//            K
+//        o-------o
+//        |       |
+//      N | [B^T] |
+//        |       |
+//        o-------o
+//            K          N
+//        o-------o   o-----o
+//      M |  [A]  | M | [C] |
+//        |       |   |     |
+//        o-------o   o-----o
+//
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment out this line for syntax highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef MWG
+ #define MWG 8 // Tile-size in dimension M (e.g. 64, 128)
+#endif
+#ifndef NWG
+ #define NWG 8 // Tile-size in dimension N (e.g. 64, 128)
+#endif
+#ifndef KWG
+ #define KWG 8 // Tile-size in dimension K (e.g. 8, 16)
+#endif
+#ifndef MDIMC
+ #define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef NDIMC
+ #define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef MDIMA
+ #define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
+#endif
+#ifndef NDIMB
+ #define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
+#endif
+#ifndef KWI
+ #define KWI 1 // Unroll factor of the KWG loop (smaller than or equal to KWG)
+#endif
+#ifndef VWM
+ #define VWM 1 // Vector width of matrices A and C
+#endif
+#ifndef VWN
+ #define VWN 1 // Vector width of matrix B
+#endif
+#ifndef STRM
+ #define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0)
+#endif
+#ifndef STRN
+ #define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0)
+#endif
+#ifndef SA
+ #define SA 0 // Use local/shared memory to cache matrix A (1) or not (0)
+#endif
+#ifndef SB
+ #define SB 0 // Use local/shared memory to cache matrix B (1) or not (0)
+#endif
+
+// Helper parameters based on the above tuning parameters
+#define MWI (MWG/MDIMC) // Work per work-item (M-dimension)
+#define NWI (NWG/NDIMC) // Work per work-item (N-dimension)
+#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
+#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
+#define MWA (MWG/MDIMA) // Number of loads per thread for matrix A (M-dimension)
+#define KWA (KWG/KDIMA) // Number of loads per thread for matrix A (K-dimension)
+#define KWB (KWG/KDIMB) // Number of loads per thread for matrix B (K-dimension)
+#define NWB (NWG/NDIMB) // Number of loads per thread for matrix B (N-dimension)
+
+// Settings
+#define USE_VECTOR_MAD 0 // Unroll the vector MAD into scalar operations (0) or keep it vectorized (1)
+
+// =================================================================================================
+
+// Data-widths in dimension M
+#if VWM == 1
+ typedef real realM;
+#elif VWM == 2
+ typedef real2 realM;
+#elif VWM == 4
+ typedef real4 realM;
+#elif VWM == 8
+ typedef real8 realM;
+#elif VWM == 16
+ typedef real16 realM;
+#endif
+
+// Data-widths in dimension N
+#if VWN == 1
+ typedef real realN;
+#elif VWN == 2
+ typedef real2 realN;
+#elif VWN == 4
+ typedef real4 realN;
+#elif VWN == 8
+ typedef real8 realN;
+#elif VWN == 16
+ typedef real16 realN;
+#endif
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific to
+// caching the A input matrix.
+#if SA == 1
+inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
+ const int kSizeM, const int tid, const int kwg) {
+ const int la0 = tid % MDIMA;
+ const int la1 = tid / MDIMA;
+ #pragma unroll
+ for (int mia=0; mia<MWA/VWM; ++mia) {
+ #pragma unroll
+ for (int kia=0; kia<KWA; ++kia) {
+
+ // Computes the indices based on strided/non-strided access
+ #if STRM == 0
+ int mg = mia + la0*(MWA/VWM);
+ #elif STRM == 1
+ int mg = la0 + mia*MDIMA;
+ #endif
+
+ // Computes the indices for the global memory
+ int kg = kia + la1*KWA;
+ int idm = mg + get_group_id(0)*(MWG/VWM);
+ int idk = kg + kwg;
+
+ // Loads the data from global memory (not transposed) into the local memory
+ alm[kg*(MWG/VWM) + mg] = agm[idk*(kSizeM/VWM) + idm];
+ }
+ }
+}
+#endif
+
+// Same as above, but now for the B input matrix
+#if SB == 1
+inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
+ const int kSizeN, const int tid, const int kwg) {
+ const int lb0 = tid % NDIMB;
+ const int lb1 = tid / NDIMB;
+ #pragma unroll
+ for (int kib=0; kib<KWB; ++kib) {
+ #pragma unroll
+ for (int nib=0; nib<NWB/VWN; ++nib) {
+
+ // Computes the indices based on strided/non-strided access
+ #if STRN == 0
+ int ng = nib + lb0*(NWB/VWN);
+ #elif STRN == 1
+ int ng = lb0 + nib*NDIMB;
+ #endif
+
+ // Computes the indices for the global memory
+ int kg = kib + lb1*KWB;
+ int idn = ng + get_group_id(1)*(NWG/VWN);
+ int idk = kg + kwg;
+
+ // Loads the data from global memory (transposed) into the local memory
+ blm[kg*(NWG/VWN) + ng] = bgm[idk*(kSizeN/VWN) + idn];
+ }
+ }
+}
+#endif
+
+// =================================================================================================
+
+// Caches global off-chip memory directly into per-thread private memory (registers). This function
+// is specific to caching the A input matrix.
+#if SA == 0
+inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
+ const int kSizeM, const int idk, const int kwg) {
+ #pragma unroll
+ for (int mi=0; mi<MWI/VWM; ++mi) {
+
+ // Computes the indices based on strided/non-strided access
+ #if STRM == 0
+ int mg = mi + get_local_id(0)*(MWI/VWM);
+ #elif STRM == 1
+ int mg = get_local_id(0) + mi*MDIMC;
+ #endif
+
+ // Computes the indices for the global memory
+ int idm = mg + get_group_id(0)*(MWG/VWM);
+
+ // Loads the data from global memory (not transposed) and stores into registers
+ apm[mi] = agm[idk*(kSizeM/VWM) + idm];
+ }
+}
+#endif
+
+// Same as above, but now for the B input matrix
+#if SB == 0
+inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
+ const int kSizeN, const int idk) {
+ #pragma unroll
+ for (int ni=0; ni<NWI/VWN; ++ni) {
+
+ // Computes the indices based on strided/non-strided access
+ #if STRN == 0
+ int ng = ni + get_local_id(1)*(NWI/VWN);
+ #elif STRN == 1
+ int ng = get_local_id(1) + ni*NDIMC;
+ #endif
+
+ // Computes the indices for the global memory
+ int idn = ng + get_group_id(1)*(NWG/VWN);
+
+ // Loads the data from global memory (transposed) and stores into registers
+ bpm[ni] = bgm[idk*(kSizeN/VWN) + idn];
+ }
+}
+#endif
+
+// =================================================================================================
+
+// Caches on-chip local memory into per-thread private memory (registers). This function is specific
+// to caching the A input matrix.
+#if SA == 1
+inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
+ #pragma unroll
+ for (int mi=0; mi<MWI/VWM; ++mi) {
+ #if STRM == 0
+ int mg = mi + get_local_id(0)*(MWI/VWM);
+ #elif STRM == 1
+ int mg = get_local_id(0) + mi*MDIMC;
+ #endif
+ apm[mi] = alm[kg*(MWG/VWM) + mg];
+ }
+}
+#endif
+
+// Same as above, but now for the B input matrix
+#if SB == 1
+inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
+ #pragma unroll
+ for (int ni=0; ni<NWI/VWN; ++ni) {
+ #if STRN == 0
+ int ng = ni + get_local_id(1)*(NWI/VWN);
+ #elif STRN == 1
+ int ng = get_local_id(1) + ni*NDIMC;
+ #endif
+ bpm[ni] = blm[kg*(NWG/VWN) + ng];
+ }
+}
+#endif
+
+// =================================================================================================
+
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
+ const real alpha, const real beta) {
+ #pragma unroll
+ for (int ni=0; ni<NWI; ++ni) {
+ #pragma unroll
+ for (int mi=0; mi<MWI/VWM; ++mi) {
+ #if STRM == 0
+ int mg = mi + get_local_id(0)*(MWI/VWM);
+ #elif STRM == 1
+ int mg = get_local_id(0) + mi*MDIMC;
+ #endif
+ #if STRN == 0
+ int ng = ni + get_local_id(1)*NWI;
+ #elif STRN == 1
+ int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
+ #endif
+ int idm = mg + get_group_id(0)*(MWG/VWM);
+ int idn = ng + get_group_id(1)*NWG;
+ int index = idn*(kSizeM/VWM) + idm;
+ #if VWM == 1
+ AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cgm[index]);
+ #elif VWM == 2
+ AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cgm[index].x);
+ AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cgm[index].y);
+ #elif VWM == 4
+ AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cgm[index].x);
+ AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cgm[index].y);
+ AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cgm[index].z);
+ AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cgm[index].w);
+ #elif VWM == 8
+ AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cgm[index].s0);
+ AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cgm[index].s1);
+ AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cgm[index].s2);
+ AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cgm[index].s3);
+ AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cgm[index].s4);
+ AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cgm[index].s5);
+ AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cgm[index].s6);
+ AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cgm[index].s7);
+ #elif VWM == 16
+ AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cgm[index].s0);
+ AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cgm[index].s1);
+ AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cgm[index].s2);
+ AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cgm[index].s3);
+ AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cgm[index].s4);
+ AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cgm[index].s5);
+ AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cgm[index].s6);
+ AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cgm[index].s7);
+ AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cgm[index].s8);
+ AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cgm[index].s9);
+ AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cgm[index].sA);
+ AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cgm[index].sB);
+ AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cgm[index].sC);
+ AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cgm[index].sD);
+ AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cgm[index].sE);
+ AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cgm[index].sF);
+ #endif
+ }
+ }
+}
+
+// =================================================================================================
+
+// The vectorized multiply-add function
+inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
+ #if USE_VECTOR_MAD == 1
+ cvec += avec * bval;
+ #else
+ #if VWM == 1
+ MultiplyAdd(cvec, avec, bval);
+ #elif VWM == 2
+ MultiplyAdd(cvec.x , avec.x, bval);
+ MultiplyAdd(cvec.y , avec.y, bval);
+ #elif VWM == 4
+ MultiplyAdd(cvec.x , avec.x, bval);
+ MultiplyAdd(cvec.y , avec.y, bval);
+ MultiplyAdd(cvec.z , avec.z, bval);
+ MultiplyAdd(cvec.w , avec.w, bval);
+ #elif VWM == 8
+ MultiplyAdd(cvec.s0, avec.s0, bval);
+ MultiplyAdd(cvec.s1, avec.s1, bval);
+ MultiplyAdd(cvec.s2, avec.s2, bval);
+ MultiplyAdd(cvec.s3, avec.s3, bval);
+ MultiplyAdd(cvec.s4, avec.s4, bval);
+ MultiplyAdd(cvec.s5, avec.s5, bval);
+ MultiplyAdd(cvec.s6, avec.s6, bval);
+ MultiplyAdd(cvec.s7, avec.s7, bval);
+ #elif VWM == 16
+ MultiplyAdd(cvec.s0, avec.s0, bval);
+ MultiplyAdd(cvec.s1, avec.s1, bval);
+ MultiplyAdd(cvec.s2, avec.s2, bval);
+ MultiplyAdd(cvec.s3, avec.s3, bval);
+ MultiplyAdd(cvec.s4, avec.s4, bval);
+ MultiplyAdd(cvec.s5, avec.s5, bval);
+ MultiplyAdd(cvec.s6, avec.s6, bval);
+ MultiplyAdd(cvec.s7, avec.s7, bval);
+ MultiplyAdd(cvec.s8, avec.s8, bval);
+ MultiplyAdd(cvec.s9, avec.s9, bval);
+ MultiplyAdd(cvec.sA, avec.sA, bval);
+ MultiplyAdd(cvec.sB, avec.sB, bval);
+ MultiplyAdd(cvec.sC, avec.sC, bval);
+ MultiplyAdd(cvec.sD, avec.sD, bval);
+ MultiplyAdd(cvec.sE, avec.sE, bval);
+ MultiplyAdd(cvec.sF, avec.sF, bval);
+ #endif
+ #endif
+ return cvec;
+}
+
+// Performs the actual computation: Cpm += Apm * Bpm
+inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], realN bpm[NWI/VWN]) {
+ #pragma unroll
+ for (int ni=0; ni<NWI/VWN; ++ni) {
+ #pragma unroll
+ for (int mi=0; mi<MWI/VWM; ++mi) {
+ #if VWN == 1
+ cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni]);
+ #elif VWN == 2
+ cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].x);
+ cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].y);
+ #elif VWN == 4
+ cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].x);
+ cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].y);
+ cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], apm[mi], bpm[ni].z);
+ cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], apm[mi], bpm[ni].w);
+ #elif VWN == 8
+ cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].s0);
+ cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].s1);
+ cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], apm[mi], bpm[ni].s2);
+ cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], apm[mi], bpm[ni].s3);
+ cpm[ni*VWN + 4][mi] = MultiplyAddVector(cpm[ni*VWN + 4][mi], apm[mi], bpm[ni].s4);
+ cpm[ni*VWN + 5][mi] = MultiplyAddVector(cpm[ni*VWN + 5][mi], apm[mi], bpm[ni].s5);
+ cpm[ni*VWN + 6][mi] = MultiplyAddVector(cpm[ni*VWN + 6][mi], apm[mi], bpm[ni].s6);
+ cpm[ni*VWN + 7][mi] = MultiplyAddVector(cpm[ni*VWN + 7][mi], apm[mi], bpm[ni].s7);
+ #elif VWN == 16
+ cpm[ni*VWN + 0 ][mi] = MultiplyAddVector(cpm[ni*VWN + 0 ][mi], apm[mi], bpm[ni].s0);
+ cpm[ni*VWN + 1 ][mi] = MultiplyAddVector(cpm[ni*VWN + 1 ][mi], apm[mi], bpm[ni].s1);
+ cpm[ni*VWN + 2 ][mi] = MultiplyAddVector(cpm[ni*VWN + 2 ][mi], apm[mi], bpm[ni].s2);
+ cpm[ni*VWN + 3 ][mi] = MultiplyAddVector(cpm[ni*VWN + 3 ][mi], apm[mi], bpm[ni].s3);
+ cpm[ni*VWN + 4 ][mi] = MultiplyAddVector(cpm[ni*VWN + 4 ][mi], apm[mi], bpm[ni].s4);
+ cpm[ni*VWN + 5 ][mi] = MultiplyAddVector(cpm[ni*VWN + 5 ][mi], apm[mi], bpm[ni].s5);
+ cpm[ni*VWN + 6 ][mi] = MultiplyAddVector(cpm[ni*VWN + 6 ][mi], apm[mi], bpm[ni].s6);
+ cpm[ni*VWN + 7 ][mi] = MultiplyAddVector(cpm[ni*VWN + 7 ][mi], apm[mi], bpm[ni].s7);
+ cpm[ni*VWN + 8 ][mi] = MultiplyAddVector(cpm[ni*VWN + 8 ][mi], apm[mi], bpm[ni].s8);
+ cpm[ni*VWN + 9 ][mi] = MultiplyAddVector(cpm[ni*VWN + 9 ][mi], apm[mi], bpm[ni].s9);
+ cpm[ni*VWN + 10][mi] = MultiplyAddVector(cpm[ni*VWN + 10][mi], apm[mi], bpm[ni].sA);
+ cpm[ni*VWN + 11][mi] = MultiplyAddVector(cpm[ni*VWN + 11][mi], apm[mi], bpm[ni].sB);
+ cpm[ni*VWN + 12][mi] = MultiplyAddVector(cpm[ni*VWN + 12][mi], apm[mi], bpm[ni].sC);
+ cpm[ni*VWN + 13][mi] = MultiplyAddVector(cpm[ni*VWN + 13][mi], apm[mi], bpm[ni].sD);
+ cpm[ni*VWN + 14][mi] = MultiplyAddVector(cpm[ni*VWN + 14][mi], apm[mi], bpm[ni].sE);
+ cpm[ni*VWN + 15][mi] = MultiplyAddVector(cpm[ni*VWN + 15][mi], apm[mi], bpm[ni].sF);
+ #endif
+ }
+ }
+}
+
+// =================================================================================================
+
+// Main entry point of the kernel. This function contains the basic skeleton; the functionality is
+// provided by the inlined functions above
+__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real alpha, const real beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
+
+ // Combined thread identifier
+ #if SA == 1 || SB == 1
+ volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
+ #endif
+
+ // Allocates workgroup-private memory (local memory)
+ #if SA == 1
+ __local realM alm[KWG * MWG/VWM];
+ #endif
+ #if SB == 1
+ __local realN blm[KWG * NWG/VWN];
+ #endif
+
+ // Allocates workitem-private memory (registers)
+ realM apm[MWI/VWM];
+ realN bpm[NWI/VWN];
+ realM cpm[NWI][MWI/VWM];
+
+ // Initializes the accumulation registers
+ #pragma unroll
+ for (int mi=0; mi<MWI/VWM; ++mi) {
+ #pragma unroll
+ for (int ni=0; ni<NWI; ++ni) {
+ #if VWM == 1
+ SetToZero(cpm[ni][mi]);
+ #elif VWM == 2
+ SetToZero(cpm[ni][mi].x);
+ SetToZero(cpm[ni][mi].y);
+ #elif VWM == 4
+ SetToZero(cpm[ni][mi].x);
+ SetToZero(cpm[ni][mi].y);
+ SetToZero(cpm[ni][mi].z);
+ SetToZero(cpm[ni][mi].w);
+ #elif VWM == 8
+ SetToZero(cpm[ni][mi].s0);
+ SetToZero(cpm[ni][mi].s1);
+ SetToZero(cpm[ni][mi].s2);
+ SetToZero(cpm[ni][mi].s3);
+ SetToZero(cpm[ni][mi].s4);
+ SetToZero(cpm[ni][mi].s5);
+ SetToZero(cpm[ni][mi].s6);
+ SetToZero(cpm[ni][mi].s7);
+ #elif VWM == 16
+ SetToZero(cpm[ni][mi].s0);
+ SetToZero(cpm[ni][mi].s1);
+ SetToZero(cpm[ni][mi].s2);
+ SetToZero(cpm[ni][mi].s3);
+ SetToZero(cpm[ni][mi].s4);
+ SetToZero(cpm[ni][mi].s5);
+ SetToZero(cpm[ni][mi].s6);
+ SetToZero(cpm[ni][mi].s7);
+ SetToZero(cpm[ni][mi].s8);
+ SetToZero(cpm[ni][mi].s9);
+ SetToZero(cpm[ni][mi].sA);
+ SetToZero(cpm[ni][mi].sB);
+ SetToZero(cpm[ni][mi].sC);
+ SetToZero(cpm[ni][mi].sD);
+ SetToZero(cpm[ni][mi].sE);
+ SetToZero(cpm[ni][mi].sF);
+ #endif
+ }
+ }
+
+ // Loops over all workgroup tiles
+ for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
+
+ // Loads data: off-chip --> local (matrix A)
+ #if SA == 1
+ GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
+ #endif
+ // Loads data: off-chip --> local (matrix B)
+ #if SB == 1
+ GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
+ #endif
+
+ // Synchronizes all threads in a workgroup
+ #if SA == 1 || SB == 1
+ barrier(CLK_LOCAL_MEM_FENCE);
+ #endif
+
+ // Loops over all workitem tiles, unrolled by a factor KWI
+ for (int pwi=0; pwi<KWG; pwi+=KWI) {
+ #pragma unroll
+ for (int pit=0; pit<KWI; ++pit) {
+ #if SA == 0 || SB == 0
+ int idk = kwg + pwi + pit;
+ #endif
+ #if SA == 1 || SB == 1
+ int kg = pwi+pit;
+ #endif
+
+ // Loads data: local --> private (matrix A)
+ #if SA == 1
+ LocalToPrivateA(alm, apm, kg);
+ // Loads data: off-chip --> private (matrix A)
+ #else
+ GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
+ #endif
+
+ // Loads data: local --> private (matrix B)
+ #if SB == 1
+ LocalToPrivateB(blm, bpm, kg);
+ // Loads data: off-chip --> private (matrix B)
+ #else
+ GlobalToPrivateB(bgm, bpm, kSizeN, idk);
+ #endif
+
+ // Performs the accumulation (Cpm += Apm * Bpm)
+ MultiplyAccumulate(cpm, apm, bpm);
+ }
+ }
+
+ // Synchronizes all threads in a workgroup
+ #if SA == 1 || SB == 1
+ barrier(CLK_LOCAL_MEM_FENCE);
+ #endif
+ }
+
+ // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+ StoreResults(cgm, cpm, kSizeM, alpha, beta);
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================