Diffstat (limited to 'src/kernels/level3')
-rw-r--r--  src/kernels/level3/convert_hermitian.opencl   | 20
-rw-r--r--  src/kernels/level3/convert_symmetric.opencl   | 20
-rw-r--r--  src/kernels/level3/convert_triangular.opencl  | 24
-rw-r--r--  src/kernels/level3/copy_fast.opencl           |  8
-rw-r--r--  src/kernels/level3/copy_pad.opencl            | 34
-rw-r--r--  src/kernels/level3/transpose_fast.opencl      |  8
-rw-r--r--  src/kernels/level3/transpose_pad.opencl       | 34
-rw-r--r--  src/kernels/level3/xgemm_part2.opencl         | 42
8 files changed, 95 insertions, 95 deletions
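
Note: every hunk below makes the same substitution: the scalar alpha (and, in xgemm_part2, beta) is no longer passed through a one-element __constant buffer and dereferenced as arg_alpha[0], but handed to the kernel by value as a real_arg and unpacked with GetRealArg(). The definitions of real_arg and GetRealArg are not part of this diff; a minimal sketch of what such a pair could look like, assuming the half-precision case is the motivation (a cl_half cannot portably be set by value from the host):

    // Sketch only: assumed definitions, not taken from this diff.
    #if PRECISION == 16
      typedef float real_arg;            // the host passes a plain float...
      #define GetRealArg(x) ((half)(x))  // ...which the kernel narrows to half
    #else
      typedef real real_arg;             // other precisions pass 'real' unchanged
      #define GetRealArg(x) (x)
    #endif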
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl
index 272905eb..ed2ded98 100644
--- a/src/kernels/level3/convert_hermitian.opencl
+++ b/src/kernels/level3/convert_hermitian.opencl
@@ -22,11 +22,11 @@ R"(
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void HermLowerToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest) {
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -61,11 +61,11 @@ void HermLowerToSquared(const int src_dim,
// Same as above, but now the matrix's data is stored in the upper-triangle
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void HermUpperToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest) {
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl
index ea6f7dbd..8ae53b37 100644
--- a/src/kernels/level3/convert_symmetric.opencl
+++ b/src/kernels/level3/convert_symmetric.opencl
@@ -22,11 +22,11 @@ R"(
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void SymmLowerToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest) {
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -55,11 +55,11 @@ void SymmLowerToSquared(const int src_dim,
// Same as above, but now the matrix's data is stored in the upper-triangle
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void SymmUpperToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest) {
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl
index 858228bb..f848dcc1 100644
--- a/src/kernels/level3/convert_triangular.opencl
+++ b/src/kernels/level3/convert_triangular.opencl
@@ -22,12 +22,12 @@ R"(
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void TriaLowerToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const int unit_diagonal) {
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const int unit_diagonal) {
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -57,12 +57,12 @@ void TriaLowerToSquared(const int src_dim,
// Same as above, but now the matrix's data is stored in the upper-triangle
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void TriaUpperToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const int unit_diagonal) {
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const int unit_diagonal) {
// Loops over the work per thread in both dimensions
#pragma unroll
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
index 54f9d987..695b9003 100644
--- a/src/kernels/level3/copy_fast.opencl
+++ b/src/kernels/level3/copy_fast.opencl
@@ -37,10 +37,10 @@ R"(
// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
void CopyMatrixFast(const int ld,
- __global const realC* restrict src,
- __global realC* dest,
- const __constant real* restrict arg_alpha) {
- const real alpha = arg_alpha[0];
+ __global const realC* restrict src,
+ __global realC* dest,
+ const real_arg arg_alpha) {
+ const real alpha = GetRealArg(arg_alpha);
#pragma unroll
for (int w_one=0; w_one<COPY_WPT; ++w_one) {
const int id_one = get_global_id(0);
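
Note: a host-side consequence of this signature change is that arg_alpha no longer needs its own device buffer. A hedged sketch of the call-site change for CopyMatrixFast above, for a single-precision build (the kernel handle and variable names are hypothetical; argument index 3 follows the new signature):

    /* Before: bind a one-element buffer to the __constant pointer.  */
    /*   clSetKernelArg(copy_kernel, 3, sizeof(cl_mem), &alpha_buf); */
    /* After: pass the scalar itself by value as a real_arg.         */
    cl_float alpha = 1.0f;  /* real_arg == float when PRECISION == 32 */
    clSetKernelArg(copy_kernel, 3, sizeof(alpha), &alpha);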
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
index 92279ecf..29480b25 100644
--- a/src/kernels/level3/copy_pad.opencl
+++ b/src/kernels/level3/copy_pad.opencl
@@ -26,14 +26,14 @@ R"(
// value and offset can be different.
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyPadMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const __constant real* restrict arg_alpha,
- const int do_conjugate) {
- const real alpha = arg_alpha[0];
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real_arg arg_alpha,
+ const int do_conjugate) {
+ const real alpha = GetRealArg(arg_alpha);
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -67,15 +67,15 @@ void CopyPadMatrix(const int src_one, const int src_two,
// be different.
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const __constant real* restrict arg_alpha,
- const int upper, const int lower,
- const int diagonal_imag_zero) {
- const real alpha = arg_alpha[0];
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real_arg arg_alpha,
+ const int upper, const int lower,
+ const int diagonal_imag_zero) {
+ const real alpha = GetRealArg(arg_alpha);
// Loops over the work per thread in both dimensions
#pragma unroll
diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl
index a2007408..70156d3a 100644
--- a/src/kernels/level3/transpose_fast.opencl
+++ b/src/kernels/level3/transpose_fast.opencl
@@ -38,10 +38,10 @@ R"(
// offset. A more general version is available in 'padtranspose.opencl'.
__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
void TransposeMatrixFast(const int ld,
- __global const realT* restrict src,
- __global realT* dest,
- const __constant real* restrict arg_alpha) {
- const real alpha = arg_alpha[0];
+ __global const realT* restrict src,
+ __global realT* dest,
+ const real_arg arg_alpha) {
+ const real alpha = GetRealArg(arg_alpha);
// Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
// way over workgroups, breaking memory-bank dependencies.
diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl
index 63cc6e9a..ba0b7062 100644
--- a/src/kernels/level3/transpose_pad.opencl
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -26,14 +26,14 @@ R"(
// destination matrix dimensions are larger than the transposed source matrix dimensions.
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposePadMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const __constant real* restrict arg_alpha,
- const int do_conjugate) {
- const real alpha = arg_alpha[0];
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real_arg arg_alpha,
+ const int do_conjugate) {
+ const real alpha = GetRealArg(arg_alpha);
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@@ -90,15 +90,15 @@ void TransposePadMatrix(const int src_one, const int src_two,
// matrix. This kernel optionally checks for upper/lower triangular matrices.
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposeMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const __constant real* restrict arg_alpha,
- const int upper, const int lower,
- const int diagonal_imag_zero) {
- const real alpha = arg_alpha[0];
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real_arg arg_alpha,
+ const int upper, const int lower,
+ const int diagonal_imag_zero) {
+ const real alpha = GetRealArg(arg_alpha);
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
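
Note: the "+ PADTRA_PAD" in the tile declaration above is the usual local-memory padding trick: without it, a column-wise (transposed) access walks addresses exactly one row apart, which all map to the same memory bank. A toy illustration with made-up sizes:

    // Toy numbers for illustration only: 32 banks, 32-wide tile.
    // Unpadded: address(i,j) = i*32 + j, so bank = j % 32. A column read
    // (fixed j, varying i) hits the same bank 32 times, serializing it.
    // Padded to 33: address(i,j) = i*33 + j, so bank = (i + j) % 32, and
    // the same column read is spread across all 32 banks.
    __local real tile_padded[32][32 + 1];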
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index 60e38c06..a1559b54 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -270,13 +270,13 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
// Main entry point of the kernel. This is the upper-triangular version.
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
void XgemmUpper(const int kSizeN, const int kSizeK,
- const __constant real* restrict arg_alpha,
- const __constant real* restrict arg_beta,
- const __global realM* restrict agm,
- const __global realN* restrict bgm,
- __global realM* cgm) {
- const real alpha = arg_alpha[0];
- const real beta = arg_beta[0];
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
// Skip this workgroup if it contains no threads contributing to the upper-triangle
if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
@@ -310,13 +310,13 @@ void XgemmUpper(const int kSizeN, const int kSizeK,
// Main entry point of the kernel. This is the lower-triangular version.
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
void XgemmLower(const int kSizeN, const int kSizeK,
- const __constant real* restrict arg_alpha,
- const __constant real* restrict arg_beta,
- const __global realM* restrict agm,
- const __global realN* restrict bgm,
- __global realM* cgm) {
- const real alpha = arg_alpha[0];
- const real beta = arg_beta[0];
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
// Skip this workgroup if it contains no threads contributing to the lower-triangle
if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
@@ -354,13 +354,13 @@ void XgemmLower(const int kSizeN, const int kSizeK,
// Main entry point of the kernel. This is the regular full version.
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
- const __constant real* restrict arg_alpha,
- const __constant real* restrict arg_beta,
- const __global realM* restrict agm,
- const __global realN* restrict bgm,
- __global realM* cgm) {
- const real alpha = arg_alpha[0];
- const real beta = arg_beta[0];
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
// Allocates workgroup-private memory (local memory)
#if SA == 1
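
Note: the by-value route also covers the complex precisions, since OpenCL accepts vector types such as float2 as kernel arguments, so a complex alpha/beta needs no staging buffer either. A hedged host-side sketch for Xgemm, assuming real_arg is float2 in a single-precision-complex build (names are hypothetical; indices 3 and 4 follow the new signature):

    cl_float2 alpha = {{1.0f, 0.0f}};   /* alpha = 1 + 0i */
    cl_float2 beta  = {{0.0f, 0.0f}};   /* beta  = 0 + 0i */
    clSetKernelArg(xgemm_kernel, 3, sizeof(alpha), &alpha);
    clSetKernelArg(xgemm_kernel, 4, sizeof(beta),  &beta);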