Now passing alpha/beta to the kernel as arguments as before fp16 support; in case of fp16 arguments are cast on host and in kernel

author: Cedric Nugteren <web@cedricnugteren.nl> 2016-07-10 20:32:01 +0200
committer: Cedric Nugteren <web@cedricnugteren.nl> 2016-07-10 20:32:01 +0200
commit: c87e877bf23d2fe38a7da2898e1734a3cdeaf48c (patch)
tree: d091dfdd826dd11e5c9e533eb46b22aeb7f6f823 /src
parent: 57f09178d89a1cf4f38a0bb338c864ed850d5470 (diff)
33 files changed, 104 insertions, 145 deletions
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 08c47d87..9d2bb65e 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -109,6 +109,16 @@ R"(
   typedef real singlereal;
 #endif
 
+// Converts a 'real argument' value to a 'real' value as passed to the kernel. Normally there is no
+// conversion, but half-precision is not supported as kernel argument so it is converted from float.
+#if PRECISION == 16
+  typedef float real_arg;
+  #define GetRealArg(x) (half)x
+#else
+  typedef real real_arg;
+  #define GetRealArg(x) x
+#endif
+
 // =================================================================================================
 
 // Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index e0efadc1..d533041b 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -23,10 +23,10 @@ R"(
 
 // Full version of the kernel with offsets and strided accesses
 __attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
+__kernel void Xaxpy(const int n, const real_arg arg_alpha,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global real* ygm, const int y_offset, const int y_inc) {
-  const real alpha = arg_alpha[0];
+  const real alpha = GetRealArg(arg_alpha);
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
   #pragma unroll
@@ -41,10 +41,10 @@ __kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
 __attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XaxpyFast(const int n, const __constant real* restrict arg_alpha,
+__kernel void XaxpyFast(const int n, const real_arg arg_alpha,
                         const __global realV* restrict xgm,
                         __global realV* ygm) {
-  const real alpha = arg_alpha[0];
+  const real alpha = GetRealArg(arg_alpha);
 
   #pragma unroll
   for (int w=0; w<WPT; ++w) {
diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 65b4291f..83b6b15d 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -212,16 +212,16 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
 // Full version of the kernel
 __attribute__((reqd_work_group_size(WGS1, 1, 1)))
 __kernel void Xgemv(const int m, const int n,
-                    const __constant real* restrict arg_alpha,
-                    const __constant real* restrict arg_beta,
+                    const real_arg arg_alpha,
+                    const real_arg arg_beta,
                     const int a_rotated,
                     const __global real* restrict agm, const int a_offset, const int a_ld,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global real* ygm, const int y_offset, const int y_inc,
                     const int do_conjugate, const int parameter,
                     const int kl, const int ku) {
-  const real alpha = arg_alpha[0];
-  const real beta = arg_beta[0];
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
 
   // Local memory for the vector X
   __local real xlm[WGS1];
diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl
index 6a494e84..1127a0b6 100644
--- a/src/kernels/level2/xgemv_fast.opencl
+++ b/src/kernels/level2/xgemv_fast.opencl
@@ -96,16 +96,16 @@ inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x,
 // --> 'do_conjugate' is 0
 __attribute__((reqd_work_group_size(WGS2, 1, 1)))
 __kernel void XgemvFast(const int m, const int n,
-                        const __constant real* restrict arg_alpha,
-                        const __constant real* restrict arg_beta,
+                        const real_arg arg_alpha,
+                        const real_arg arg_beta,
                         const int a_rotated,
                         const __global realVF* restrict agm, const int a_offset, const int a_ld,
                         const __global real* restrict xgm, const int x_offset, const int x_inc,
                         __global real* ygm, const int y_offset, const int y_inc,
                         const int do_conjugate, const int parameter,
                         const int kl, const int ku) {
-  const real alpha = arg_alpha[0];
-  const real beta = arg_beta[0];
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
 
   // Local memory for the vector X
   __local real xlm[WGS2];
@@ -198,16 +198,16 @@ __kernel void XgemvFast(const int m, const int n,
 // --> 'do_conjugate' is 0
 __attribute__((reqd_work_group_size(WGS3, 1, 1)))
 __kernel void XgemvFastRot(const int m, const int n,
-                           const __constant real* restrict arg_alpha,
-                           const __constant real* restrict arg_beta,
+                           const real_arg arg_alpha,
+                           const real_arg arg_beta,
                            const int a_rotated,
                            const __global realVFR* restrict agm, const int a_offset, const int a_ld,
                            const __global real* restrict xgm, const int x_offset, const int x_inc,
                            __global real* ygm, const int y_offset, const int y_inc,
                            const int do_conjugate, const int parameter,
                            const int kl, const int ku) {
-  const real alpha = arg_alpha[0];
-  const real beta = arg_beta[0];
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
 
   // Local memory for the vector X
   __local real xlm[WGS3];
diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl
index 63817afb..f218a346 100644
--- a/src/kernels/level2/xger.opencl
+++ b/src/kernels/level2/xger.opencl
@@ -20,12 +20,12 @@ R"(
 // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
 __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
 __kernel void Xger(const int max1, const int max2,
-                   const __constant real* restrict arg_alpha,
+                   const real_arg arg_alpha,
                    const __global real* restrict xgm, const int x_offset, const int x_inc,
                    const __global real* ygm, const int y_offset, const int y_inc,
                    __global real* restrict agm, const int a_offset, const int a_ld,
                    const int is_rowmajor) {
-  const real alpha = arg_alpha[0];
+  const real alpha = GetRealArg(arg_alpha);
 
   // Register storage for X and Y
   real xvalues[WPT];
diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl
index fc635f2e..1200ee63 100644
--- a/src/kernels/level2/xher.opencl
+++ b/src/kernels/level2/xher.opencl
@@ -20,11 +20,11 @@ R"(
 // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
 __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
 __kernel void Xher(const int n,
-                   const __constant real* restrict arg_alpha,
+                   const real_arg arg_alpha,
                    const __global real* restrict xgm, const int x_offset, const int x_inc,
                    __global real* restrict agm, const int a_offset, const int a_ld,
                    const int is_upper, const int is_rowmajor) {
-  const real alpha = arg_alpha[0];
+  const real alpha = GetRealArg(arg_alpha);
 
   // Register storage for X and XT
   real xvalues[WPT];
diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl
index a66f255f..d0f41571 100644
--- a/src/kernels/level2/xher2.opencl
+++ b/src/kernels/level2/xher2.opencl
@@ -20,12 +20,12 @@ R"(
 // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
 __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
 __kernel void Xher2(const int n,
-                    const __constant real* restrict arg_alpha,
+                    const real_arg arg_alpha,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     const __global real* restrict ygm, const int y_offset, const int y_inc,
                     __global real* restrict agm, const int a_offset, const int a_ld,
                     const int is_upper, const int is_rowmajor) {
-  const real alpha = arg_alpha[0];
+  const real alpha = GetRealArg(arg_alpha);
 
   // Register storage for X and Y
   real xvalues[WPT];
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
index 09e54e6d..dd975bf1 100644
--- a/src/kernels/level3/copy_fast.opencl
+++ b/src/kernels/level3/copy_fast.opencl
@@ -39,8 +39,8 @@ __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
 __kernel void CopyMatrixFast(const int ld,
                              __global const realC* restrict src,
                              __global realC* dest,
-                             const __constant real* restrict arg_alpha) {
-  const real alpha = arg_alpha[0];
+                             const real_arg arg_alpha) {
+  const real alpha = GetRealArg(arg_alpha);
   #pragma unroll
   for (int w_one=0; w_one<COPY_WPT; ++w_one) {
     const int id_one = get_global_id(0);
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
index d276cc60..d0771c31 100644
--- a/src/kernels/level3/copy_pad.opencl
+++ b/src/kernels/level3/copy_pad.opencl
@@ -31,9 +31,9 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two,
                             const int dest_one, const int dest_two,
                             const int dest_ld, const int dest_offset,
                             __global real* dest,
-                            const __constant real* restrict arg_alpha,
+                            const real_arg arg_alpha,
                             const int do_conjugate) {
-  const real alpha = arg_alpha[0];
+  const real alpha = GetRealArg(arg_alpha);
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
@@ -72,10 +72,10 @@ __kernel void CopyMatrix(const int src_one, const int src_two,
                          const int dest_one, const int dest_two,
                          const int dest_ld, const int dest_offset,
                          __global real* dest,
-                         const __constant real* restrict arg_alpha,
+                         const real_arg arg_alpha,
                          const int upper, const int lower,
                          const int diagonal_imag_zero) {
-  const real alpha = arg_alpha[0];
+  const real alpha = GetRealArg(arg_alpha);
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl
index d5c46a30..ea343533 100644
--- a/src/kernels/level3/transpose_fast.opencl
+++ b/src/kernels/level3/transpose_fast.opencl
@@ -40,8 +40,8 @@ __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
 __kernel void TransposeMatrixFast(const int ld,
                                   __global const realT* restrict src,
                                   __global realT* dest,
-                                  const __constant real* restrict arg_alpha) {
-  const real alpha = arg_alpha[0];
+                                  const real_arg arg_alpha) {
+  const real alpha = GetRealArg(arg_alpha);
 
   // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
   // way over workgroups, breaking memory-bank dependencies.
diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl
index 2de0c7bd..2e20d667 100644
--- a/src/kernels/level3/transpose_pad.opencl
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -31,9 +31,9 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two,
                                  const int dest_one, const int dest_two,
                                  const int dest_ld, const int dest_offset,
                                  __global real* dest,
-                                 const __constant real* restrict arg_alpha,
+                                 const real_arg arg_alpha,
                                  const int do_conjugate) {
-  const real alpha = arg_alpha[0];
+  const real alpha = GetRealArg(arg_alpha);
 
   // Local memory to store a tile of the matrix (for coalescing)
   __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@@ -95,10 +95,10 @@ __kernel void TransposeMatrix(const int src_one, const int src_two,
                               const int dest_one, const int dest_two,
                               const int dest_ld, const int dest_offset,
                               __global real* dest,
-                              const __constant real* restrict arg_alpha,
+                              const real_arg arg_alpha,
                               const int upper, const int lower,
                               const int diagonal_imag_zero) {
-  const real alpha = arg_alpha[0];
+  const real alpha = GetRealArg(arg_alpha);
 
   // Local memory to store a tile of the matrix (for coalescing)
   __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index 42c1127c..87e28cb5 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -270,13 +270,13 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
 // Main entry point of the kernel. This is the upper-triangular version.
 __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
 __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
-                         const __constant real* restrict arg_alpha,
-                         const __constant real* restrict arg_beta,
+                         const real_arg arg_alpha,
+                         const real_arg arg_beta,
                          const __global realM* restrict agm,
                          const __global realN* restrict bgm,
                          __global realM* cgm) {
-  const real alpha = arg_alpha[0];
-  const real beta = arg_beta[0];
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
 
   // Skip these threads if they do not contain threads contributing to the upper-triangle
   if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
@@ -310,13 +310,13 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
 // Main entry point of the kernel. This is the lower-triangular version.
 __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
 __kernel void XgemmLower(const int kSizeN, const int kSizeK,
-                         const __constant real* restrict arg_alpha,
-                         const __constant real* restrict arg_beta,
+                         const real_arg arg_alpha,
+                         const real_arg arg_beta,
                          const __global realM* restrict agm,
                          const __global realN* restrict bgm,
                          __global realM* cgm) {
-  const real alpha = arg_alpha[0];
-  const real beta = arg_beta[0];
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
 
   // Skip these threads if they do not contain threads contributing to the lower-triangle
   if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
@@ -354,13 +354,13 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK,
 // Main entry point of the kernel. This is the regular full version.
 __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
 __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
-                    const __constant real* restrict arg_alpha,
-                    const __constant real* restrict arg_beta,
+                    const real_arg arg_alpha,
+                    const real_arg arg_beta,
                     const __global realM* restrict agm,
                     const __global realN* restrict bgm,
                     __global realM* cgm) {
-  const real alpha = arg_alpha[0];
-  const real beta = arg_beta[0];
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
 
   // Allocates workgroup-private memory (local memory)
   #if SA == 1
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index c99cd39d..e624a2b1 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -88,10 +88,6 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont
     }
   }
 
-  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
-  auto alpha_buffer = Buffer<T>(context, 1);
-  alpha_buffer.Write(queue, 1, &alpha);
-
   // Retrieves the kernel from the compiled binary
   try {
     auto kernel = Kernel(program, kernel_name);
@@ -101,7 +97,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont
       kernel.SetArgument(0, static_cast<int>(src_ld));
       kernel.SetArgument(1, src());
       kernel.SetArgument(2, dest());
-      kernel.SetArgument(3, alpha_buffer());
+      kernel.SetArgument(3, GetRealArg(alpha));
     }
     else {
       kernel.SetArgument(0, static_cast<int>(src_one));
@@ -114,7 +110,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont
       kernel.SetArgument(7, static_cast<int>(dest_ld));
       kernel.SetArgument(8, static_cast<int>(dest_offset));
       kernel.SetArgument(9, dest());
-      kernel.SetArgument(10, alpha_buffer());
+      kernel.SetArgument(10, GetRealArg(alpha));
       if (do_pad) {
         kernel.SetArgument(11, static_cast<int>(do_conjugate));
       }
diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp
index 5b6c9e77..3445e2b5 100644
--- a/src/routines/level1/xaxpy.cpp
+++ b/src/routines/level1/xaxpy.cpp
@@ -59,20 +59,16 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
     const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel = Kernel(program, kernel_name);
 
-    // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
-    auto alpha_buffer = Buffer<T>(context_, 1);
-    alpha_buffer.Write(queue_, 1, &alpha);
-
     // Sets the kernel arguments
     if (use_fast_kernel) {
       kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, alpha_buffer());
+      kernel.SetArgument(1, GetRealArg(alpha));
       kernel.SetArgument(2, x_buffer());
       kernel.SetArgument(3, y_buffer());
     }
     else {
       kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, alpha_buffer());
+      kernel.SetArgument(1, GetRealArg(alpha));
       kernel.SetArgument(2, x_buffer());
       kernel.SetArgument(3, static_cast<int>(x_offset));
       kernel.SetArgument(4, static_cast<int>(x_inc));
diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp
index 21fb397c..2842ef07 100644
--- a/src/routines/level2/xgemv.cpp
+++ b/src/routines/level2/xgemv.cpp
@@ -126,12 +126,6 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
     local_size = db_["WGS3"];
   }
 
-  // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
-  auto alpha_buffer = Buffer<T>(context_, 1);
-  auto beta_buffer = Buffer<T>(context_, 1);
-  alpha_buffer.Write(queue_, 1, &alpha);
-  beta_buffer.Write(queue_, 1, &beta);
-
   // Retrieves the Xgemv kernel from the compiled binary
   try {
     const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
@@ -140,8 +134,8 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(m_real));
     kernel.SetArgument(1, static_cast<int>(n_real));
-    kernel.SetArgument(2, alpha_buffer());
-    kernel.SetArgument(3, beta_buffer());
+    kernel.SetArgument(2, GetRealArg(alpha));
+    kernel.SetArgument(3, GetRealArg(beta));
     kernel.SetArgument(4, static_cast<int>(a_rotated));
     kernel.SetArgument(5, a_buffer());
     kernel.SetArgument(6, static_cast<int>(a_offset));
diff --git a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp
index 353047d2..29cffe0c 100644
--- a/src/routines/level2/xger.cpp
+++ b/src/routines/level2/xger.cpp
@@ -56,10 +56,6 @@ StatusCode Xger<T>::DoGer(const Layout layout,
   status = TestVectorY(n, y_buffer, y_offset, y_inc);
   if (ErrorIn(status)) { return status; }
 
-  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
-  auto alpha_buffer = Buffer<T>(context_, 1);
-  alpha_buffer.Write(queue_, 1, &alpha);
-
   // Retrieves the kernel from the compiled binary
   try {
     const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
@@ -68,7 +64,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(a_one));
     kernel.SetArgument(1, static_cast<int>(a_two));
-    kernel.SetArgument(2, alpha_buffer());
+    kernel.SetArgument(2, GetRealArg(alpha));
     kernel.SetArgument(3, x_buffer());
     kernel.SetArgument(4, static_cast<int>(x_offset));
     kernel.SetArgument(5, static_cast<int>(x_inc));
diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp
index ed8ba9e9..6dd95938 100644
--- a/src/routines/level2/xher.cpp
+++ b/src/routines/level2/xher.cpp
@@ -70,10 +70,6 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
   // Creates a matching version of alpha
   const auto matching_alpha = GetAlpha(alpha);
 
-  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
-  auto alpha_buffer = Buffer<T>(context_, 1);
-  alpha_buffer.Write(queue_, 1, &matching_alpha);
-
   // Retrieves the kernel from the compiled binary
   try {
     const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
@@ -81,7 +77,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
 
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(n));
-    kernel.SetArgument(1, alpha_buffer());
+    kernel.SetArgument(1, GetRealArg(matching_alpha));
     kernel.SetArgument(2, x_buffer());
     kernel.SetArgument(3, static_cast<int>(x_offset));
     kernel.SetArgument(4, static_cast<int>(x_inc));
diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp
index 50572cea..3d57a9b9 100644
--- a/src/routines/level2/xher2.cpp
+++ b/src/routines/level2/xher2.cpp
@@ -58,10 +58,6 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
   status = TestVectorY(n, y_buffer, y_offset, y_inc);
   if (ErrorIn(status)) { return status; }
 
-  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
-  auto alpha_buffer = Buffer<T>(context_, 1);
-  alpha_buffer.Write(queue_, 1, &alpha);
-
   // Retrieves the kernel from the compiled binary
   try {
     const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
@@ -69,7 +65,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
 
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(n));
-    kernel.SetArgument(1, alpha_buffer());
+    kernel.SetArgument(1, GetRealArg(alpha));
     kernel.SetArgument(2, x_buffer());
     kernel.SetArgument(3, static_cast<int>(x_offset));
     kernel.SetArgument(4, static_cast<int>(x_inc));
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 9ea5559c..97e8db7e 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -118,12 +118,6 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
     const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
     const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled);
 
-    // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
-    auto alpha_buffer = Buffer<T>(context_, 1);
-    auto beta_buffer = Buffer<T>(context_, 1);
-    alpha_buffer.Write(queue_, 1, &alpha);
-    beta_buffer.Write(queue_, 1, &beta);
-
     // Events of all kernels (including pre/post processing kernels)
     auto eventWaitList = std::vector<Event>();
     auto emptyEventList = std::vector<Event>();
@@ -174,8 +168,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
       kernel.SetArgument(0, static_cast<int>(m_ceiled));
       kernel.SetArgument(1, static_cast<int>(n_ceiled));
       kernel.SetArgument(2, static_cast<int>(k_ceiled));
-      kernel.SetArgument(3, alpha_buffer());
-      kernel.SetArgument(4, beta_buffer());
+      kernel.SetArgument(3, GetRealArg(alpha));
+      kernel.SetArgument(4, GetRealArg(beta));
       kernel.SetArgument(5, a_temp());
       kernel.SetArgument(6, b_temp());
       kernel.SetArgument(7, c_temp());
diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp
index bd7a053e..65e2be55 100644
--- a/src/routines/level3/xher2k.cpp
+++ b/src/routines/level3/xher2k.cpp
@@ -107,12 +107,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
     auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
     auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
 
-    // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+    // Convert the arguments to complex versions
     auto complex_beta = T{beta, static_cast<U>(0.0)};
-    auto alpha_buffer = Buffer<T>(context_, 1);
-    auto beta_buffer = Buffer<T>(context_, 1);
-    alpha_buffer.Write(queue_, 1, &alpha);
-    beta_buffer.Write(queue_, 1, &complex_beta);
 
     // Events of all kernels (including pre/post processing kernels)
     auto eventWaitList = std::vector<Event>();
@@ -180,8 +176,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
       // Sets the kernel arguments
       kernel.SetArgument(0, static_cast<int>(n_ceiled));
       kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha_buffer());
-      kernel.SetArgument(3, beta_buffer());
+      kernel.SetArgument(2, GetRealArg(alpha));
+      kernel.SetArgument(3, GetRealArg(complex_beta));
       kernel.SetArgument(4, a1_temp());
       kernel.SetArgument(5, b2_temp());
       kernel.SetArgument(6, c_temp());
@@ -202,10 +198,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
       // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
       auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
       auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
-      alpha_buffer.Write(queue_, 1, &conjugate_alpha);
-      beta_buffer.Write(queue_, 1, &complex_one);
-      kernel.SetArgument(2, alpha_buffer());
-      kernel.SetArgument(3, beta_buffer());
+      kernel.SetArgument(2, GetRealArg(conjugate_alpha));
+      kernel.SetArgument(3, GetRealArg(complex_one));
       kernel.SetArgument(4, b1_temp());
       kernel.SetArgument(5, a2_temp());
 
diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp
index 6ef7f21f..cc87e3e9 100644
--- a/src/routines/level3/xherk.cpp
+++ b/src/routines/level3/xherk.cpp
@@ -98,13 +98,9 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
     auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
     auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
 
-    // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+    // Convert the arguments to complex versions
     auto complex_alpha = T{alpha, static_cast<U>(0.0)};
     auto complex_beta = T{beta, static_cast<U>(0.0)};
-    auto alpha_buffer = Buffer<T>(context_, 1);
-    auto beta_buffer = Buffer<T>(context_, 1);
-    alpha_buffer.Write(queue_, 1, &complex_alpha);
-    beta_buffer.Write(queue_, 1, &complex_beta);
 
     // Events of all kernels (including pre/post processing kernels)
     auto eventWaitList = std::vector<Event>();
@@ -152,8 +148,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
       // Sets the kernel arguments
       kernel.SetArgument(0, static_cast<int>(n_ceiled));
       kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha_buffer());
-      kernel.SetArgument(3, beta_buffer());
+      kernel.SetArgument(2, GetRealArg(complex_alpha));
+      kernel.SetArgument(3, GetRealArg(complex_beta));
       kernel.SetArgument(4, a_temp());
       kernel.SetArgument(5, b_temp());
       kernel.SetArgument(6, c_temp());
diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp
index 424d4d2d..18a1eac7 100644
--- a/src/routines/level3/xsyr2k.cpp
+++ b/src/routines/level3/xsyr2k.cpp
@@ -97,12 +97,6 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
     auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
     auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
 
-    // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
-    auto alpha_buffer = Buffer<T>(context_, 1);
-    auto beta_buffer = Buffer<T>(context_, 1);
-    alpha_buffer.Write(queue_, 1, &alpha);
-    beta_buffer.Write(queue_, 1, &beta);
-
     // Events of all kernels (including pre/post processing kernels)
     auto eventWaitList = std::vector<Event>();
     auto emptyEventList = std::vector<Event>();
@@ -149,8 +143,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
       // Sets the kernel arguments
       kernel.SetArgument(0, static_cast<int>(n_ceiled));
       kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha_buffer());
-      kernel.SetArgument(3, beta_buffer());
+      kernel.SetArgument(2, GetRealArg(alpha));
+      kernel.SetArgument(3, GetRealArg(beta));
       kernel.SetArgument(4, a_temp());
       kernel.SetArgument(5, b_temp());
       kernel.SetArgument(6, c_temp());
@@ -170,8 +164,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
 
       // Swaps the arguments for matrices A and B, and sets 'beta' to 1
       auto one = static_cast<T>(1);
-      beta_buffer.Write(queue_, 1, &one);
-      kernel.SetArgument(3, beta_buffer());
+      kernel.SetArgument(3, GetRealArg(one));
       kernel.SetArgument(4, b_temp());
       kernel.SetArgument(5, a_temp());
 
diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp
index f56c232b..1992cec1 100644
--- a/src/routines/level3/xsyrk.cpp
+++ b/src/routines/level3/xsyrk.cpp
@@ -90,12 +90,6 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
     auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
     auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
 
-    // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
-    auto alpha_buffer = Buffer<T>(context_, 1);
-    auto beta_buffer = Buffer<T>(context_, 1);
-    alpha_buffer.Write(queue_, 1, &alpha);
-    beta_buffer.Write(queue_, 1, &beta);
-
     // Events of all kernels (including pre/post processing kernels)
     auto eventWaitList = std::vector<Event>();
     auto emptyEventList = std::vector<Event>();
@@ -132,8 +126,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
       // Sets the kernel arguments
       kernel.SetArgument(0, static_cast<int>(n_ceiled));
       kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha_buffer());
-      kernel.SetArgument(3, beta_buffer());
+      kernel.SetArgument(2, GetRealArg(alpha));
+      kernel.SetArgument(3, GetRealArg(beta));
       kernel.SetArgument(4, a_temp());
       kernel.SetArgument(5, a_temp());
       kernel.SetArgument(6, c_temp());
diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp
index 34269bc7..78ded56e 100644
--- a/src/tuning/kernels/copy_fast.cpp
+++ b/src/tuning/kernels/copy_fast.cpp
@@ -86,11 +86,10 @@ class TuneCopy {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentInput(a_mat);
     tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentInput(alpha_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
   }
 
   // Describes how to compute the performance metrics
diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp
index 1e0dccd3..90f5ea82 100644
--- a/src/tuning/kernels/copy_pad.cpp
+++ b/src/tuning/kernels/copy_pad.cpp
@@ -86,7 +86,6 @@ class TunePad {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
     tuner.AddArgumentScalar(static_cast<int>(args.m));
@@ -97,7 +96,7 @@ class TunePad {
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(0);
     tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentInput(alpha_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
     tuner.AddArgumentScalar(0);
   }
 
diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp
index 7ac19cb6..10fa80cb 100644
--- a/src/tuning/kernels/transpose_fast.cpp
+++ b/src/tuning/kernels/transpose_fast.cpp
@@ -91,11 +91,10 @@ class TuneTranspose {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentInput(a_mat);
     tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentInput(alpha_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
   }
 
   // Describes how to compute the performance metrics
diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp
index 63274415..507718eb 100644
--- a/src/tuning/kernels/transpose_pad.cpp
+++ b/src/tuning/kernels/transpose_pad.cpp
@@ -90,7 +90,6 @@ class TunePadTranspose {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
     tuner.AddArgumentScalar(static_cast<int>(args.m));
@@ -101,7 +100,7 @@ class TunePadTranspose {
     tuner.AddArgumentScalar(static_cast<int>(args.n));
     tuner.AddArgumentScalar(0);
     tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentInput(alpha_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
     tuner.AddArgumentScalar(0);
   }
 
diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp
index 88d12c1f..0033b3c6 100644
--- a/src/tuning/kernels/xaxpy.cpp
+++ b/src/tuning/kernels/xaxpy.cpp
@@ -89,9 +89,8 @@ class TuneXaxpy {
                            std::vector<T> &x_vec, std::vector<T> &y_vec,
                            std::vector<T> &, std::vector<T> &, std::vector<T> &,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentInput(alpha_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
     tuner.AddArgumentInput(x_vec);
     tuner.AddArgumentOutput(y_vec);
   }
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 4b1efdef..898b8435 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -121,13 +121,11 @@ class TuneXgemm {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
-    auto beta_buffer = std::vector<T>{args.beta};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
     tuner.AddArgumentScalar(static_cast<int>(args.k));
-    tuner.AddArgumentInput(alpha_buffer);
-    tuner.AddArgumentInput(beta_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
+    tuner.AddArgumentScalar(GetRealArg(args.beta));
     tuner.AddArgumentInput(a_mat);
     tuner.AddArgumentInput(b_mat);
     tuner.AddArgumentOutput(c_mat);
diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp
index d42155ae..5c187d33 100644
--- a/src/tuning/kernels/xgemv.cpp
+++ b/src/tuning/kernels/xgemv.cpp
@@ -96,13 +96,11 @@ class TuneXgemv {
                            std::vector<T> &x_vec, std::vector<T> &y_vec,
                            std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
-    auto beta_buffer = std::vector<T>{args.beta};
     auto a_rotated = (V==3) ? 1 : 0;
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentInput(alpha_buffer);
-    tuner.AddArgumentInput(beta_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
+    tuner.AddArgumentScalar(GetRealArg(args.beta));
     tuner.AddArgumentScalar(static_cast<int>(a_rotated));
     tuner.AddArgumentInput(a_mat);
     tuner.AddArgumentScalar(0);
diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp
index d2590c53..1fb5c531 100644
--- a/src/tuning/kernels/xger.cpp
+++ b/src/tuning/kernels/xger.cpp
@@ -85,10 +85,9 @@ class TuneXger {
                            std::vector<T> &x_vec, std::vector<T> &y_vec,
                            std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentInput(alpha_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
     tuner.AddArgumentInput(x_vec);
     tuner.AddArgumentScalar(0); // x_offset
     tuner.AddArgumentScalar(1); // x_increment
diff --git a/src/utilities.cpp b/src/utilities.cpp
index 68e480c5..11a6c439 100644
--- a/src/utilities.cpp
+++ b/src/utilities.cpp
@@ -332,6 +332,14 @@ void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_com
   result.Write(queue, size, result_cpu);
 }
 
+// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is
+// no conversion, but half-precision is not supported as kernel argument so it is converted to float.
+template <> typename RealArg<half>::Type GetRealArg(const half value) { return HalfToFloat(value); }
+template <> typename RealArg<float>::Type GetRealArg(const float value) { return value; }
+template <> typename RealArg<double>::Type GetRealArg(const double value) { return value; }
+template <> typename RealArg<float2>::Type GetRealArg(const float2 value) { return value; }
+template <> typename RealArg<double2>::Type GetRealArg(const double2 value) { return value; }
+
 // =================================================================================================
 
 // Rounding functions performing ceiling and division operations
diff --git a/src/utilities.hpp b/src/utilities.hpp
index d5efab9f..700d30d6 100644
--- a/src/utilities.hpp
+++ b/src/utilities.hpp
@@ -227,6 +227,12 @@ void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& sour
 Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw);
 void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw);
 
+// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is
+// no conversion, but half-precision is not supported as kernel argument so it is converted to float.
+template <typename T> struct RealArg { using Type = T; };
+template <> struct RealArg<half> { using Type = float; };
+template <typename T> typename RealArg<T>::Type GetRealArg(const T value);
+
 // =================================================================================================
 
 // Rounding functions
author	Cedric Nugteren <web@cedricnugteren.nl>	2016-07-10 20:32:01 +0200
committer	Cedric Nugteren <web@cedricnugteren.nl>	2016-07-10 20:32:01 +0200
commit	c87e877bf23d2fe38a7da2898e1734a3cdeaf48c (patch)
tree	d091dfdd826dd11e5c9e533eb46b22aeb7f6f823 /src
parent	57f09178d89a1cf4f38a0bb338c864ed850d5470 (diff)