summaryrefslogtreecommitdiff
path: root/src/kernels
diff options
context:
space:
mode:
Diffstat (limited to 'src/kernels')
-rw-r--r--src/kernels/common.opencl7
-rw-r--r--src/kernels/level1/xamax.opencl16
-rw-r--r--src/kernels/level1/xasum.opencl14
-rw-r--r--src/kernels/level1/xaxpy.opencl16
-rw-r--r--src/kernels/level1/xcopy.opencl16
-rw-r--r--src/kernels/level1/xdot.opencl16
-rw-r--r--src/kernels/level1/xnrm2.opencl14
-rw-r--r--src/kernels/level1/xscal.opencl15
-rw-r--r--src/kernels/level1/xswap.opencl16
-rw-r--r--src/kernels/level2/xgemv.opencl4
-rw-r--r--src/kernels/level2/xgemv_fast.opencl40
-rw-r--r--src/kernels/level2/xger.opencl14
-rw-r--r--src/kernels/level2/xher.opencl12
-rw-r--r--src/kernels/level2/xher2.opencl14
-rw-r--r--src/kernels/level3/convert_hermitian.opencl28
-rw-r--r--src/kernels/level3/convert_symmetric.opencl28
-rw-r--r--src/kernels/level3/convert_triangular.opencl32
-rw-r--r--src/kernels/level3/copy_fast.opencl10
-rw-r--r--src/kernels/level3/copy_pad.opencl38
-rw-r--r--src/kernels/level3/transpose_fast.opencl10
-rw-r--r--src/kernels/level3/transpose_pad.opencl38
-rw-r--r--src/kernels/level3/xgemm_part1.opencl2
-rw-r--r--src/kernels/level3/xgemm_part2.opencl324
-rw-r--r--src/kernels/level3/xgemm_part3.opencl229
24 files changed, 515 insertions, 438 deletions
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 2fca6b73..b0817242 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -148,6 +148,13 @@ R"(
#define SetToOne(a) a = ONE
#endif
+// Determines whether a variable is zero
+#if PRECISION == 3232 || PRECISION == 6464
+ #define IsZero(a) ((a.x == ZERO) && (a.y == ZERO))
+#else
+ #define IsZero(a) (a == ZERO)
+#endif
+
// The absolute value (component-wise)
#if PRECISION == 3232 || PRECISION == 6464
#define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y)
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 48d0eb5c..48ad2e75 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -30,10 +30,10 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xamax(const int n,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global singlereal* maxgm, __global unsigned int* imaxgm) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xamax(const int n,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global singlereal* maxgm, __global unsigned int* imaxgm) {
__local singlereal maxlm[WGS1];
__local unsigned int imaxlm[WGS1];
const int lid = get_local_id(0);
@@ -95,10 +95,10 @@ __kernel void Xamax(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm,
- const __global unsigned int* restrict imaxgm,
- __global unsigned int* imax, const int imax_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XamaxEpilogue(const __global singlereal* restrict maxgm,
+ const __global unsigned int* restrict imaxgm,
+ __global unsigned int* imax, const int imax_offset) {
__local singlereal maxlm[WGS2];
__local unsigned int imaxlm[WGS2];
const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl
index 58d0f11b..1fc91be8 100644
--- a/src/kernels/level1/xasum.opencl
+++ b/src/kernels/level1/xasum.opencl
@@ -30,10 +30,10 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xasum(const int n,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global real* output) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xasum(const int n,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* output) {
__local real lm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
@@ -74,9 +74,9 @@ __kernel void Xasum(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XasumEpilogue(const __global real* restrict input,
- __global real* asum, const int asum_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XasumEpilogue(const __global real* restrict input,
+ __global real* asum, const int asum_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index d533041b..ece8476e 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -22,10 +22,10 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xaxpy(const int n, const real_arg arg_alpha,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global real* ygm, const int y_offset, const int y_inc) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xaxpy(const int n, const real_arg arg_alpha,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc) {
const real alpha = GetRealArg(arg_alpha);
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
@@ -40,10 +40,10 @@ __kernel void Xaxpy(const int n, const real_arg arg_alpha,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XaxpyFast(const int n, const real_arg arg_alpha,
- const __global realV* restrict xgm,
- __global realV* ygm) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XaxpyFast(const int n, const real_arg arg_alpha,
+ const __global realV* restrict xgm,
+ __global realV* ygm) {
const real alpha = GetRealArg(arg_alpha);
#pragma unroll
diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl
index 97c27ccf..228e0735 100644
--- a/src/kernels/level1/xcopy.opencl
+++ b/src/kernels/level1/xcopy.opencl
@@ -22,10 +22,10 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xcopy(const int n,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global real* ygm, const int y_offset, const int y_inc) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xcopy(const int n,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc) {
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
@@ -38,10 +38,10 @@ __kernel void Xcopy(const int n,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XcopyFast(const int n,
- const __global realV* restrict xgm,
- __global realV* ygm) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XcopyFast(const int n,
+ const __global realV* restrict xgm,
+ __global realV* ygm) {
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);
diff --git a/src/kernels/level1/xdot.opencl b/src/kernels/level1/xdot.opencl
index e13eb3c1..02f04ea7 100644
--- a/src/kernels/level1/xdot.opencl
+++ b/src/kernels/level1/xdot.opencl
@@ -30,11 +30,11 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the multiplication and the majority of the sum operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xdot(const int n,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- const __global real* restrict ygm, const int y_offset, const int y_inc,
- __global real* output, const int do_conjugate) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xdot(const int n,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ const __global real* restrict ygm, const int y_offset, const int y_inc,
+ __global real* output, const int do_conjugate) {
__local real lm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
@@ -73,9 +73,9 @@ __kernel void Xdot(const int n,
// The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to
// be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XdotEpilogue(const __global real* restrict input,
- __global real* dot, const int dot_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XdotEpilogue(const __global real* restrict input,
+ __global real* dot, const int dot_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl
index 9803687a..f6d869cb 100644
--- a/src/kernels/level1/xnrm2.opencl
+++ b/src/kernels/level1/xnrm2.opencl
@@ -30,10 +30,10 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the multiplication and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xnrm2(const int n,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global real* output) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xnrm2(const int n,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* output) {
__local real lm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
@@ -72,9 +72,9 @@ __kernel void Xnrm2(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void Xnrm2Epilogue(const __global real* restrict input,
- __global real* nrm2, const int nrm2_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void Xnrm2Epilogue(const __global real* restrict input,
+ __global real* nrm2, const int nrm2_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl
index 59936776..3da9c2fd 100644
--- a/src/kernels/level1/xscal.opencl
+++ b/src/kernels/level1/xscal.opencl
@@ -22,9 +22,10 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xscal(const int n, const real alpha,
- __global real* xgm, const int x_offset, const int x_inc) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xscal(const int n, const real_arg arg_alpha,
+ __global real* xgm, const int x_offset, const int x_inc) {
+ const real alpha = GetRealArg(arg_alpha);
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
@@ -40,9 +41,11 @@ __kernel void Xscal(const int n, const real alpha,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XscalFast(const int n, const real alpha,
- __global realV* xgm) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XscalFast(const int n, const real_arg arg_alpha,
+ __global realV* xgm) {
+ const real alpha = GetRealArg(arg_alpha);
+
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);
diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl
index f6487b58..267271c0 100644
--- a/src/kernels/level1/xswap.opencl
+++ b/src/kernels/level1/xswap.opencl
@@ -22,10 +22,10 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xswap(const int n,
- __global real* xgm, const int x_offset, const int x_inc,
- __global real* ygm, const int y_offset, const int y_inc) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xswap(const int n,
+ __global real* xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc) {
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
@@ -40,10 +40,10 @@ __kernel void Xswap(const int n,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XswapFast(const int n,
- __global realV* xgm,
- __global realV* ygm) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XswapFast(const int n,
+ __global realV* xgm,
+ __global realV* ygm) {
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);
diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 83b6b15d..ff011acd 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -210,8 +210,8 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
// =================================================================================================
// Full version of the kernel
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xgemv(const int m, const int n,
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xgemv(const int m, const int n,
const real_arg arg_alpha,
const real_arg arg_beta,
const int a_rotated,
diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl
index 210c42c1..02a1f956 100644
--- a/src/kernels/level2/xgemv_fast.opencl
+++ b/src/kernels/level2/xgemv_fast.opencl
@@ -88,16 +88,16 @@ inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, co
// --> 'a_ld' is a multiple of VW2
// --> 'a_rotated' is 0
// --> 'do_conjugate' is 0
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XgemvFast(const int m, const int n,
- const real_arg arg_alpha,
- const real_arg arg_beta,
- const int a_rotated,
- const __global realVF* restrict agm, const int a_offset, const int a_ld,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global real* ygm, const int y_offset, const int y_inc,
- const int do_conjugate, const int parameter,
- const int kl_unused, const int ku_unused) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XgemvFast(const int m, const int n,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const int a_rotated,
+ const __global realVF* restrict agm, const int a_offset, const int a_ld,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc,
+ const int do_conjugate, const int parameter,
+ const int kl_unused, const int ku_unused) {
const real alpha = GetRealArg(arg_alpha);
const real beta = GetRealArg(arg_beta);
@@ -190,16 +190,16 @@ __kernel void XgemvFast(const int m, const int n,
// --> 'a_ld' is a multiple of VW3
// --> 'a_rotated' is 1
// --> 'do_conjugate' is 0
-__attribute__((reqd_work_group_size(WGS3, 1, 1)))
-__kernel void XgemvFastRot(const int m, const int n,
- const real_arg arg_alpha,
- const real_arg arg_beta,
- const int a_rotated,
- const __global realVFR* restrict agm, const int a_offset, const int a_ld,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global real* ygm, const int y_offset, const int y_inc,
- const int do_conjugate, const int parameter,
- const int kl_unused, const int ku_unused) {
+__kernel __attribute__((reqd_work_group_size(WGS3, 1, 1)))
+void XgemvFastRot(const int m, const int n,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const int a_rotated,
+ const __global realVFR* restrict agm, const int a_offset, const int a_ld,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc,
+ const int do_conjugate, const int parameter,
+ const int kl_unused, const int ku_unused) {
const real alpha = GetRealArg(arg_alpha);
const real beta = GetRealArg(arg_beta);
diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl
index f218a346..1b9ded12 100644
--- a/src/kernels/level2/xger.opencl
+++ b/src/kernels/level2/xger.opencl
@@ -18,13 +18,13 @@ R"(
// =================================================================================================
// Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xger(const int max1, const int max2,
- const real_arg arg_alpha,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- const __global real* ygm, const int y_offset, const int y_inc,
- __global real* restrict agm, const int a_offset, const int a_ld,
- const int is_rowmajor) {
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xger(const int max1, const int max2,
+ const real_arg arg_alpha,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ const __global real* ygm, const int y_offset, const int y_inc,
+ __global real* restrict agm, const int a_offset, const int a_ld,
+ const int is_rowmajor) {
const real alpha = GetRealArg(arg_alpha);
// Register storage for X and Y
diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl
index 1200ee63..b0772218 100644
--- a/src/kernels/level2/xher.opencl
+++ b/src/kernels/level2/xher.opencl
@@ -18,12 +18,12 @@ R"(
// =================================================================================================
// Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher(const int n,
- const real_arg arg_alpha,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global real* restrict agm, const int a_offset, const int a_ld,
- const int is_upper, const int is_rowmajor) {
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xher(const int n,
+ const real_arg arg_alpha,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* restrict agm, const int a_offset, const int a_ld,
+ const int is_upper, const int is_rowmajor) {
const real alpha = GetRealArg(arg_alpha);
// Register storage for X and XT
diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl
index d0f41571..00a756c9 100644
--- a/src/kernels/level2/xher2.opencl
+++ b/src/kernels/level2/xher2.opencl
@@ -18,13 +18,13 @@ R"(
// =================================================================================================
// Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher2(const int n,
- const real_arg arg_alpha,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- const __global real* restrict ygm, const int y_offset, const int y_inc,
- __global real* restrict agm, const int a_offset, const int a_ld,
- const int is_upper, const int is_rowmajor) {
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xher2(const int n,
+ const real_arg arg_alpha,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ const __global real* restrict ygm, const int y_offset, const int y_inc,
+ __global real* restrict agm, const int a_offset, const int a_ld,
+ const int is_upper, const int is_rowmajor) {
const real alpha = GetRealArg(arg_alpha);
// Register storage for X and Y
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl
index 53cc161a..ed2ded98 100644
--- a/src/kernels/level3/convert_hermitian.opencl
+++ b/src/kernels/level3/convert_hermitian.opencl
@@ -20,13 +20,13 @@ R"(
// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermLowerToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void HermLowerToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -59,13 +59,13 @@ __kernel void HermLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermUpperToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void HermUpperToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl
index c6ce93ca..8ae53b37 100644
--- a/src/kernels/level3/convert_symmetric.opencl
+++ b/src/kernels/level3/convert_symmetric.opencl
@@ -20,13 +20,13 @@ R"(
// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmLowerToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void SymmLowerToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -53,13 +53,13 @@ __kernel void SymmLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmUpperToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void SymmUpperToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl
index fdd2461a..f848dcc1 100644
--- a/src/kernels/level3/convert_triangular.opencl
+++ b/src/kernels/level3/convert_triangular.opencl
@@ -20,14 +20,14 @@ R"(
// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TriaLowerToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const int unit_diagonal) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void TriaLowerToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const int unit_diagonal) {
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -55,14 +55,14 @@ __kernel void TriaLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TriaUpperToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const int unit_diagonal) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void TriaUpperToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const int unit_diagonal) {
// Loops over the work per thread in both dimensions
#pragma unroll
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
index dd975bf1..695b9003 100644
--- a/src/kernels/level3/copy_fast.opencl
+++ b/src/kernels/level3/copy_fast.opencl
@@ -35,11 +35,11 @@ R"(
// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
-__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
-__kernel void CopyMatrixFast(const int ld,
- __global const realC* restrict src,
- __global realC* dest,
- const real_arg arg_alpha) {
+__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+void CopyMatrixFast(const int ld,
+ __global const realC* restrict src,
+ __global realC* dest,
+ const real_arg arg_alpha) {
const real alpha = GetRealArg(arg_alpha);
#pragma unroll
for (int w_one=0; w_one<COPY_WPT; ++w_one) {
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
index d0771c31..29480b25 100644
--- a/src/kernels/level3/copy_pad.opencl
+++ b/src/kernels/level3/copy_pad.opencl
@@ -24,15 +24,15 @@ R"(
// Copies a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
// value and offset can be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void CopyPadMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const real_arg arg_alpha,
- const int do_conjugate) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void CopyPadMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real_arg arg_alpha,
+ const int do_conjugate) {
const real alpha = GetRealArg(arg_alpha);
// Loops over the work per thread in both dimensions
@@ -65,16 +65,16 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two,
// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
// writes only the actual data back to the destination matrix. Again, the ld value and offset can
// be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void CopyMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const real_arg arg_alpha,
- const int upper, const int lower,
- const int diagonal_imag_zero) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void CopyMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real_arg arg_alpha,
+ const int upper, const int lower,
+ const int diagonal_imag_zero) {
const real alpha = GetRealArg(arg_alpha);
// Loops over the work per thread in both dimensions
diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl
index ea343533..70156d3a 100644
--- a/src/kernels/level3/transpose_fast.opencl
+++ b/src/kernels/level3/transpose_fast.opencl
@@ -36,11 +36,11 @@ R"(
// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
// offset. A more general version is available in 'padtranspose.opencl'.
-__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
-__kernel void TransposeMatrixFast(const int ld,
- __global const realT* restrict src,
- __global realT* dest,
- const real_arg arg_alpha) {
+__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+void TransposeMatrixFast(const int ld,
+ __global const realT* restrict src,
+ __global realT* dest,
+ const real_arg arg_alpha) {
const real alpha = GetRealArg(arg_alpha);
// Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl
index 2e20d667..ba0b7062 100644
--- a/src/kernels/level3/transpose_pad.opencl
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -24,15 +24,15 @@ R"(
// Transposes a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the transposed source matrix dimensions.
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void TransposePadMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const real_arg arg_alpha,
- const int do_conjugate) {
+__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+void TransposePadMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real_arg arg_alpha,
+ const int do_conjugate) {
const real alpha = GetRealArg(arg_alpha);
// Local memory to store a tile of the matrix (for coalescing)
@@ -88,16 +88,16 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two,
// Transposes a matrix, while considering possible padding in the source matrix. Data is read from a
// padded source matrix, but only the actual data is written back to the transposed destination
// matrix. This kernel optionally checks for upper/lower triangular matrices.
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void TransposeMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const real_arg arg_alpha,
- const int upper, const int lower,
- const int diagonal_imag_zero) {
+__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+void TransposeMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real_arg arg_alpha,
+ const int upper, const int lower,
+ const int diagonal_imag_zero) {
const real alpha = GetRealArg(arg_alpha);
// Local memory to store a tile of the matrix (for coalescing)
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 1ad0a558..d0ce06ad 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -31,7 +31,7 @@
// o-------o o-----o
//
//
-// This kernel is seperated into two files. This is part 1 out of 2.
+// This kernel is seperated into three files. This is part 1 out of 3.
//
// =================================================================================================
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index 87e28cb5..e8234a29 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -7,7 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
-// This is part 2 of 2 of the GEMM kernel. See part 1 for more information.
+// This is part 2 of 3 of the GEMM kernel. See part 1 for more information.
//
// =================================================================================================
@@ -133,260 +133,98 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int
#endif
int idm = mg + GetGroupID0() * (MWG/VWM);
int idn = ng + GetGroupID1() * NWG;
-
- // The final multiplication with alpha and the addition with beta*C
int index = idn*(kSizeM/VWM) + idm;
+
realM result;
realM xval = cpm[ni][mi];
- realM yval = cgm[index];
- #if VWM == 1
- AXPBY(result, alpha, xval, beta, yval);
- #elif VWM == 2
- AXPBY(result.x, alpha, xval.x, beta, yval.x);
- AXPBY(result.y, alpha, xval.y, beta, yval.y);
- #elif VWM == 4
- AXPBY(result.x, alpha, xval.x, beta, yval.x);
- AXPBY(result.y, alpha, xval.y, beta, yval.y);
- AXPBY(result.z, alpha, xval.z, beta, yval.z);
- AXPBY(result.w, alpha, xval.w, beta, yval.w);
- #elif VWM == 8
- AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
- AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
- AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
- AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
- AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
- AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
- AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
- AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
- #elif VWM == 16
- AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
- AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
- AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
- AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
- AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
- AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
- AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
- AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
- AXPBY(result.s8, alpha, xval.s8, beta, yval.s8);
- AXPBY(result.s9, alpha, xval.s9, beta, yval.s9);
- AXPBY(result.sA, alpha, xval.sA, beta, yval.sA);
- AXPBY(result.sB, alpha, xval.sB, beta, yval.sB);
- AXPBY(result.sC, alpha, xval.sC, beta, yval.sC);
- AXPBY(result.sD, alpha, xval.sD, beta, yval.sD);
- AXPBY(result.sE, alpha, xval.sE, beta, yval.sE);
- AXPBY(result.sF, alpha, xval.sF, beta, yval.sF);
- #endif
- cgm[index] = result;
- }
- }
-}
-
-// =================================================================================================
-
-// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
-inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
- const __global realM* restrict agm, const __global realN* restrict bgm,
- __global realM* cgm, realM cpm[NWI][MWI/VWM]
- #if SA == 1 && SB == 1
- , __local realM* alm, __local realN* blm
- #elif SA == 1
- , __local realM* alm
- #elif SB == 1
- , __local realN* blm
- #endif
- ) {
-
- // Allocates workitem-private memory (registers)
- realM apm[MWI/VWM];
- realN bpm[NWI/VWN];
-
- // Combined thread identifier (volatile to disable caching)
- #if SA == 1 || SB == 1
- volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
- #endif
-
- // Initializes the accumulation registers
- InitAccRegisters(cpm);
-
- // Loops over all workgroup tiles
- for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
- // Loads data: off-chip --> local (matrix A)
- #if SA == 1
- GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
- #endif
- // Loads data: off-chip --> local (matrix B)
- #if SB == 1
- GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
- #endif
- #if SA == 1 || SB == 1
- barrier(CLK_LOCAL_MEM_FENCE);
- #endif
-
- // Loops over all workitem tiles, unrolled by a factor KWI
- for (int pwi=0; pwi<KWG; pwi+=KWI) {
- #pragma unroll
- for (int pit=0; pit<KWI; ++pit) {
- #if SA == 0 || SB == 0
- int idk = kwg + pwi + pit;
- #endif
- #if SA == 1 || SB == 1
- int kg = pwi+pit;
- #endif
-
- // Loads data: local --> private (matrix A)
- #if SA == 1
- LocalToPrivateA(alm, apm, kg);
- // Loads data: off-chip --> private (matrix A)
- #else
- GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
+ // The final multiplication with alpha (in case beta == 0)
+ if (IsZero(beta)) {
+ #if VWM == 1
+ Multiply(result, alpha, xval);
+ #elif VWM == 2
+ Multiply(result.x, alpha, xval.x);
+ Multiply(result.y, alpha, xval.y);
+ #elif VWM == 4
+ Multiply(result.x, alpha, xval.x);
+ Multiply(result.y, alpha, xval.y);
+ Multiply(result.z, alpha, xval.z);
+ Multiply(result.w, alpha, xval.w);
+ #elif VWM == 8
+ Multiply(result.s0, alpha, xval.s0);
+ Multiply(result.s1, alpha, xval.s1);
+ Multiply(result.s2, alpha, xval.s2);
+ Multiply(result.s3, alpha, xval.s3);
+ Multiply(result.s4, alpha, xval.s4);
+ Multiply(result.s5, alpha, xval.s5);
+ Multiply(result.s6, alpha, xval.s6);
+ Multiply(result.s7, alpha, xval.s7);
+ #elif VWM == 16
+ Multiply(result.s0, alpha, xval.s0);
+ Multiply(result.s1, alpha, xval.s1);
+ Multiply(result.s2, alpha, xval.s2);
+ Multiply(result.s3, alpha, xval.s3);
+ Multiply(result.s4, alpha, xval.s4);
+ Multiply(result.s5, alpha, xval.s5);
+ Multiply(result.s6, alpha, xval.s6);
+ Multiply(result.s7, alpha, xval.s7);
+ Multiply(result.s8, alpha, xval.s8);
+ Multiply(result.s9, alpha, xval.s9);
+ Multiply(result.sA, alpha, xval.sA);
+ Multiply(result.sB, alpha, xval.sB);
+ Multiply(result.sC, alpha, xval.sC);
+ Multiply(result.sD, alpha, xval.sD);
+ Multiply(result.sE, alpha, xval.sE);
+ Multiply(result.sF, alpha, xval.sF);
#endif
+ }
- // Loads data: local --> private (matrix B)
- #if SB == 1
- LocalToPrivateB(blm, bpm, kg);
- // Loads data: off-chip --> private (matrix B)
- #else
- GlobalToPrivateB(bgm, bpm, kSizeN, idk);
+ // The final multiplication with alpha and the addition with beta*C
+ else {
+ realM yval = cgm[index];
+ #if VWM == 1
+ AXPBY(result, alpha, xval, beta, yval);
+ #elif VWM == 2
+ AXPBY(result.x, alpha, xval.x, beta, yval.x);
+ AXPBY(result.y, alpha, xval.y, beta, yval.y);
+ #elif VWM == 4
+ AXPBY(result.x, alpha, xval.x, beta, yval.x);
+ AXPBY(result.y, alpha, xval.y, beta, yval.y);
+ AXPBY(result.z, alpha, xval.z, beta, yval.z);
+ AXPBY(result.w, alpha, xval.w, beta, yval.w);
+ #elif VWM == 8
+ AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
+ AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
+ AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
+ AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
+ AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
+ AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
+ AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
+ AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
+ #elif VWM == 16
+ AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
+ AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
+ AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
+ AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
+ AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
+ AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
+ AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
+ AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
+ AXPBY(result.s8, alpha, xval.s8, beta, yval.s8);
+ AXPBY(result.s9, alpha, xval.s9, beta, yval.s9);
+ AXPBY(result.sA, alpha, xval.sA, beta, yval.sA);
+ AXPBY(result.sB, alpha, xval.sB, beta, yval.sB);
+ AXPBY(result.sC, alpha, xval.sC, beta, yval.sC);
+ AXPBY(result.sD, alpha, xval.sD, beta, yval.sD);
+ AXPBY(result.sE, alpha, xval.sE, beta, yval.sE);
+ AXPBY(result.sF, alpha, xval.sF, beta, yval.sF);
#endif
-
- // Performs the accumulation (Cpm += Apm * Bpm)
- MultiplyAccumulate(cpm, apm, bpm);
}
+ cgm[index] = result;
}
- #if SA == 1 || SB == 1
- barrier(CLK_LOCAL_MEM_FENCE);
- #endif
- }
- #if GLOBAL_MEM_FENCE == 1
- barrier(CLK_GLOBAL_MEM_FENCE);
- #endif
-}
-
-// =================================================================================================
-// The upper-triangular and lower-triangular kernels are only used in special cases
-#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
-
-// Main entry point of the kernel. This is the upper-triangular version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
- const real_arg arg_alpha,
- const real_arg arg_beta,
- const __global realM* restrict agm,
- const __global realN* restrict bgm,
- __global realM* cgm) {
- const real alpha = GetRealArg(arg_alpha);
- const real beta = GetRealArg(arg_beta);
-
- // Skip these threads if they do not contain threads contributing to the upper-triangle
- if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
- return;
- }
-
- // Allocates workgroup-private memory (local memory)
- #if SA == 1
- __local realM alm[KWG * MWG/VWM];
- #endif
- #if SB == 1
- __local realN blm[KWG * NWG/VWN];
- #endif
-
- // Computes the matrix-multiplication and stores the result in register memory
- realM cpm[NWI][MWI/VWM];
- #if SA == 1 && SB == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
- #elif SA == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
- #elif SB == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
- #else
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
- #endif
-
- // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
- StoreResults(cgm, cpm, kSizeN, alpha, beta);
-}
-
-// Main entry point of the kernel. This is the lower-triangular version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmLower(const int kSizeN, const int kSizeK,
- const real_arg arg_alpha,
- const real_arg arg_beta,
- const __global realM* restrict agm,
- const __global realN* restrict bgm,
- __global realM* cgm) {
- const real alpha = GetRealArg(arg_alpha);
- const real beta = GetRealArg(arg_beta);
-
- // Skip these threads if they do not contain threads contributing to the lower-triangle
- if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
- return;
}
-
- // Allocates workgroup-private memory (local memory)
- #if SA == 1
- __local realM alm[KWG * MWG/VWM];
- #endif
- #if SB == 1
- __local realN blm[KWG * NWG/VWN];
- #endif
-
- // Computes the matrix-multiplication and stores the result in register memory
- realM cpm[NWI][MWI/VWM];
- #if SA == 1 && SB == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
- #elif SA == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
- #elif SB == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
- #else
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
- #endif
-
- // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
- StoreResults(cgm, cpm, kSizeN, alpha, beta);
-}
-
-// =================================================================================================
-// If not using a triangular version, include the regular kernel
-#else
-
-// Main entry point of the kernel. This is the regular full version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
- const real_arg arg_alpha,
- const real_arg arg_beta,
- const __global realM* restrict agm,
- const __global realN* restrict bgm,
- __global realM* cgm) {
- const real alpha = GetRealArg(arg_alpha);
- const real beta = GetRealArg(arg_beta);
-
- // Allocates workgroup-private memory (local memory)
- #if SA == 1
- __local realM alm[KWG * MWG/VWM];
- #endif
- #if SB == 1
- __local realN blm[KWG * NWG/VWN];
- #endif
-
- // Computes the matrix-multiplication and stores the result in register memory
- realM cpm[NWI][MWI/VWM];
- #if SA == 1 && SB == 1
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
- #elif SA == 1
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
- #elif SB == 1
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
- #else
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
- #endif
-
- // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
- StoreResults(cgm, cpm, kSizeM, alpha, beta);
}
-#endif
// =================================================================================================
// End of the C++11 raw string literal
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
new file mode 100644
index 00000000..a5faef5a
--- /dev/null
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -0,0 +1,229 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is part 3 of 3 of the GEMM kernel. See part 1 for more information.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
+inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
+ const __global realM* restrict agm, const __global realN* restrict bgm,
+ __global realM* cgm, realM cpm[NWI][MWI/VWM]
+ #if SA == 1 && SB == 1
+ , __local realM* alm, __local realN* blm
+ #elif SA == 1
+ , __local realM* alm
+ #elif SB == 1
+ , __local realN* blm
+ #endif
+ ) {
+
+ // Allocates workitem-private memory (registers)
+ realM apm[MWI/VWM];
+ realN bpm[NWI/VWN];
+
+ // Combined thread identifier (volatile to disable caching)
+ #if SA == 1 || SB == 1
+ volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
+ #endif
+
+ // Initializes the accumulation registers
+ InitAccRegisters(cpm);
+
+ // Loops over all workgroup tiles
+ for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
+
+ // Loads data: off-chip --> local (matrix A)
+ #if SA == 1
+ GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
+ #endif
+ // Loads data: off-chip --> local (matrix B)
+ #if SB == 1
+ GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
+ #endif
+ #if SA == 1 || SB == 1
+ barrier(CLK_LOCAL_MEM_FENCE);
+ #endif
+
+ // Loops over all workitem tiles, unrolled by a factor KWI
+ for (int pwi=0; pwi<KWG; pwi+=KWI) {
+ #pragma unroll
+ for (int pit=0; pit<KWI; ++pit) {
+ #if SA == 0 || SB == 0
+ int idk = kwg + pwi + pit;
+ #endif
+ #if SA == 1 || SB == 1
+ int kg = pwi+pit;
+ #endif
+
+ // Loads data: local --> private (matrix A)
+ #if SA == 1
+ LocalToPrivateA(alm, apm, kg);
+ // Loads data: off-chip --> private (matrix A)
+ #else
+ GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
+ #endif
+
+ // Loads data: local --> private (matrix B)
+ #if SB == 1
+ LocalToPrivateB(blm, bpm, kg);
+ // Loads data: off-chip --> private (matrix B)
+ #else
+ GlobalToPrivateB(bgm, bpm, kSizeN, idk);
+ #endif
+
+ // Performs the accumulation (Cpm += Apm * Bpm)
+ MultiplyAccumulate(cpm, apm, bpm);
+ }
+ }
+ #if SA == 1 || SB == 1
+ barrier(CLK_LOCAL_MEM_FENCE);
+ #endif
+ }
+ #if GLOBAL_MEM_FENCE == 1
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ #endif
+}
+
+// =================================================================================================
+// The upper-triangular and lower-triangular kernels are only used in special cases
+#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
+
+// Main entry point of the kernel. This is the upper-triangular version.
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void XgemmUpper(const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ // Skip these threads if they do not contain threads contributing to the upper-triangle
+ if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
+ return;
+ }
+
+ // Allocates workgroup-private memory (local memory)
+ #if SA == 1
+ __local realM alm[KWG * MWG/VWM];
+ #endif
+ #if SB == 1
+ __local realN blm[KWG * NWG/VWN];
+ #endif
+
+ // Computes the matrix-multiplication and stores the result in register memory
+ realM cpm[NWI][MWI/VWM];
+ #if SA == 1 && SB == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+ #elif SA == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+ #elif SB == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+ #else
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+ #endif
+
+ // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+ StoreResults(cgm, cpm, kSizeN, alpha, beta);
+}
+
+// Main entry point of the kernel. This is the lower-triangular version.
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void XgemmLower(const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ // Skip these threads if they do not contain threads contributing to the lower-triangle
+ if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
+ return;
+ }
+
+ // Allocates workgroup-private memory (local memory)
+ #if SA == 1
+ __local realM alm[KWG * MWG/VWM];
+ #endif
+ #if SB == 1
+ __local realN blm[KWG * NWG/VWN];
+ #endif
+
+ // Computes the matrix-multiplication and stores the result in register memory
+ realM cpm[NWI][MWI/VWM];
+ #if SA == 1 && SB == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+ #elif SA == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+ #elif SB == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+ #else
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+ #endif
+
+ // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+ StoreResults(cgm, cpm, kSizeN, alpha, beta);
+}
+
+// =================================================================================================
+// If not using a triangular version, include the regular kernel
+#else
+
+// Main entry point of the kernel. This is the regular full version.
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ // Allocates workgroup-private memory (local memory)
+ #if SA == 1
+ __local realM alm[KWG * MWG/VWM];
+ #endif
+ #if SB == 1
+ __local realN blm[KWG * NWG/VWN];
+ #endif
+
+ // Computes the matrix-multiplication and stores the result in register memory
+ realM cpm[NWI][MWI/VWM];
+ #if SA == 1 && SB == 1
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+ #elif SA == 1
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+ #elif SB == 1
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+ #else
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+ #endif
+
+ // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+ StoreResults(cgm, cpm, kSizeM, alpha, beta);
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================