Prepared the GEMV kernels and tuner for half-precision support

author: Cedric Nugteren <web@cedricnugteren.nl> 2016-05-22 15:22:54 +0200
committer: Cedric Nugteren <web@cedricnugteren.nl> 2016-05-22 15:22:54 +0200
commit: 88551b40055a51800118865940626020d3fc064e (patch)
tree: d9ced489d62d21b26cc45b5c2d400b6ed3e7ce4a /src/kernels
parent: 803aaf3070a6b04095b29100e628a4308bb9dcf7 (diff)
2 files changed, 17 insertions, 3 deletions
diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 30b131b4..65b4291f 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -211,13 +211,17 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
 
 // Full version of the kernel
 __attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
+__kernel void Xgemv(const int m, const int n,
+                    const __constant real* restrict arg_alpha,
+                    const __constant real* restrict arg_beta,
                     const int a_rotated,
                     const __global real* restrict agm, const int a_offset, const int a_ld,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global real* ygm, const int y_offset, const int y_inc,
                     const int do_conjugate, const int parameter,
                     const int kl, const int ku) {
+  const real alpha = arg_alpha[0];
+  const real beta = arg_beta[0];
 
   // Local memory for the vector X
   __local real xlm[WGS1];
diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl
index 61fdffa3..6a494e84 100644
--- a/src/kernels/level2/xgemv_fast.opencl
+++ b/src/kernels/level2/xgemv_fast.opencl
@@ -95,13 +95,18 @@ inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x,
 // --> 'a_rotated' is 0
 // --> 'do_conjugate' is 0
 __attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XgemvFast(const int m, const int n, const real alpha, const real beta,
+__kernel void XgemvFast(const int m, const int n,
+                        const __constant real* restrict arg_alpha,
+                        const __constant real* restrict arg_beta,
                         const int a_rotated,
                         const __global realVF* restrict agm, const int a_offset, const int a_ld,
                         const __global real* restrict xgm, const int x_offset, const int x_inc,
                         __global real* ygm, const int y_offset, const int y_inc,
                         const int do_conjugate, const int parameter,
                         const int kl, const int ku) {
+  const real alpha = arg_alpha[0];
+  const real beta = arg_beta[0];
+
   // Local memory for the vector X
   __local real xlm[WGS2];
 
@@ -192,13 +197,18 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b
 // --> 'a_rotated' is 1
 // --> 'do_conjugate' is 0
 __attribute__((reqd_work_group_size(WGS3, 1, 1)))
-__kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta,
+__kernel void XgemvFastRot(const int m, const int n,
+                           const __constant real* restrict arg_alpha,
+                           const __constant real* restrict arg_beta,
                            const int a_rotated,
                            const __global realVFR* restrict agm, const int a_offset, const int a_ld,
                            const __global real* restrict xgm, const int x_offset, const int x_inc,
                            __global real* ygm, const int y_offset, const int y_inc,
                            const int do_conjugate, const int parameter,
                            const int kl, const int ku) {
+  const real alpha = arg_alpha[0];
+  const real beta = arg_beta[0];
+
   // Local memory for the vector X
   __local real xlm[WGS3];
author	Cedric Nugteren <web@cedricnugteren.nl>	2016-05-22 15:22:54 +0200
committer	Cedric Nugteren <web@cedricnugteren.nl>	2016-05-22 15:22:54 +0200
commit	88551b40055a51800118865940626020d3fc064e (patch)
tree	d9ced489d62d21b26cc45b5c2d400b6ed3e7ce4a /src/kernels
parent	803aaf3070a6b04095b29100e628a4308bb9dcf7 (diff)