From 88551b40055a51800118865940626020d3fc064e Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 22 May 2016 15:22:54 +0200
Subject: Prepared the GEMV kernels and tuner for half-precision support

---
 src/kernels/level2/xgemv.opencl      |  6 +++++-
 src/kernels/level2/xgemv_fast.opencl | 14 ++++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'src/kernels')

diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 30b131b4..65b4291f 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -211,13 +211,17 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
 
 // Full version of the kernel
 __attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
+__kernel void Xgemv(const int m, const int n,
+                    const __constant real* restrict arg_alpha,
+                    const __constant real* restrict arg_beta,
                     const int a_rotated,
                     const __global real* restrict agm, const int a_offset, const int a_ld,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global real* ygm, const int y_offset, const int y_inc,
                     const int do_conjugate, const int parameter,
                     const int kl, const int ku) {
+  const real alpha = arg_alpha[0];
+  const real beta = arg_beta[0];
 
   // Local memory for the vector X
   __local real xlm[WGS1];
diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl
index 61fdffa3..6a494e84 100644
--- a/src/kernels/level2/xgemv_fast.opencl
+++ b/src/kernels/level2/xgemv_fast.opencl
@@ -95,13 +95,18 @@ inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x,
 // --> 'a_rotated' is 0
 // --> 'do_conjugate' is 0
 __attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XgemvFast(const int m, const int n, const real alpha, const real beta,
+__kernel void XgemvFast(const int m, const int n,
+                        const __constant real* restrict arg_alpha,
+                        const __constant real* restrict arg_beta,
                         const int a_rotated,
                         const __global realVF* restrict agm, const int a_offset, const int a_ld,
                         const __global real* restrict xgm, const int x_offset, const int x_inc,
                         __global real* ygm, const int y_offset, const int y_inc,
                         const int do_conjugate, const int parameter,
                         const int kl, const int ku) {
+  const real alpha = arg_alpha[0];
+  const real beta = arg_beta[0];
+
   // Local memory for the vector X
   __local real xlm[WGS2];
 
@@ -192,13 +197,18 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b
 // --> 'a_rotated' is 1
 // --> 'do_conjugate' is 0
 __attribute__((reqd_work_group_size(WGS3, 1, 1)))
-__kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta,
+__kernel void XgemvFastRot(const int m, const int n,
+                           const __constant real* restrict arg_alpha,
+                           const __constant real* restrict arg_beta,
                            const int a_rotated,
                            const __global realVFR* restrict agm, const int a_offset, const int a_ld,
                            const __global real* restrict xgm, const int x_offset, const int x_inc,
                            __global real* ygm, const int y_offset, const int y_inc,
                            const int do_conjugate, const int parameter,
                            const int kl, const int ku) {
+  const real alpha = arg_alpha[0];
+  const real beta = arg_beta[0];
+
   // Local memory for the vector X
   __local real xlm[WGS3];
 
-- 
cgit v1.2.3