14 files changed, 97 insertions, 95 deletions
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 2bd2f714..27add015 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -75,7 +75,6 @@ void Xamax(const int n,
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // Performs reduction in local memory
-  #pragma unroll
   for (int s=WGS1/2; s>0; s=s>>1) {
     if (lid < s) {
       if (maxlm[lid + s] >= maxlm[lid]) {
@@ -117,7 +116,6 @@ void XamaxEpilogue(const __global singlereal* restrict maxgm,
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // Performs reduction in local memory
-  #pragma unroll
   for (int s=WGS2/2; s>0; s=s>>1) {
     if (lid < s) {
       if (maxlm[lid + s] >= maxlm[lid]) {
diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl
index 1fc91be8..29e7fa3e 100644
--- a/src/kernels/level1/xasum.opencl
+++ b/src/kernels/level1/xasum.opencl
@@ -56,7 +56,6 @@ void Xasum(const int n,
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // Performs reduction in local memory
-  #pragma unroll
   for (int s=WGS1/2; s>0; s=s>>1) {
     if (lid < s) {
       Add(lm[lid], lm[lid], lm[lid + s]);
@@ -85,7 +84,6 @@ void XasumEpilogue(const __global real* restrict input,
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // Performs reduction in local memory
-  #pragma unroll
   for (int s=WGS2/2; s>0; s=s>>1) {
     if (lid < s) {
       Add(lm[lid], lm[lid], lm[lid + s]);
diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl
index 228e0735..aed80fc2 100644
--- a/src/kernels/level1/xcopy.opencl
+++ b/src/kernels/level1/xcopy.opencl
@@ -28,7 +28,6 @@ void Xcopy(const int n,
            __global real* ygm, const int y_offset, const int y_inc) {
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
-  #pragma unroll
   for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
     ygm[id*y_inc + y_offset] = xgm[id*x_inc + x_offset];
   }
@@ -43,8 +42,8 @@ void XcopyFast(const int n,
                const __global realV* restrict xgm,
                __global realV* ygm) {
   #pragma unroll
-  for (int w=0; w<WPT; ++w) {
-    const int id = w*get_global_size(0) + get_global_id(0);
+  for (int _w = 0; _w < WPT; _w += 1) {
+    const int id = _w*get_global_size(0) + get_global_id(0);
     ygm[id] = xgm[id];
   }
 }
diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl
index f6d869cb..6a81c150 100644
--- a/src/kernels/level1/xnrm2.opencl
+++ b/src/kernels/level1/xnrm2.opencl
@@ -54,7 +54,6 @@ void Xnrm2(const int n,
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // Performs reduction in local memory
-  #pragma unroll
   for (int s=WGS1/2; s>0; s=s>>1) {
     if (lid < s) {
       Add(lm[lid], lm[lid], lm[lid + s]);
@@ -83,7 +82,6 @@ void Xnrm2Epilogue(const __global real* restrict input,
   barrier(CLK_LOCAL_MEM_FENCE);
 
   // Performs reduction in local memory
-  #pragma unroll
   for (int s=WGS2/2; s>0; s=s>>1) {
     if (lid < s) {
       Add(lm[lid], lm[lid], lm[lid + s]);
diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl
index 3da9c2fd..cb133e88 100644
--- a/src/kernels/level1/xscal.opencl
+++ b/src/kernels/level1/xscal.opencl
@@ -28,7 +28,6 @@ void Xscal(const int n, const real_arg arg_alpha,
   const real alpha = GetRealArg(arg_alpha);
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
-  #pragma unroll
   for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
     real xvalue = xgm[id*x_inc + x_offset];
     real result;
@@ -47,8 +46,8 @@ void XscalFast(const int n, const real_arg arg_alpha,
   const real alpha = GetRealArg(arg_alpha);
 
   #pragma unroll
-  for (int w=0; w<WPT; ++w) {
-    const int id = w*get_global_size(0) + get_global_id(0);
+  for (int _w = 0; _w < WPT; _w += 1) {
+    const int id = _w*get_global_size(0) + get_global_id(0);
     realV xvalue = xgm[id];
     realV result;
     result = MultiplyVector(result, alpha, xvalue);
diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl
index 267271c0..bf5b6194 100644
--- a/src/kernels/level1/xswap.opencl
+++ b/src/kernels/level1/xswap.opencl
@@ -28,7 +28,6 @@ void Xswap(const int n,
            __global real* ygm, const int y_offset, const int y_inc) {
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
-  #pragma unroll
   for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
     real temp = xgm[id*x_inc + x_offset];
     xgm[id*x_inc + x_offset] = ygm[id*y_inc + y_offset];
@@ -45,8 +44,8 @@ void XswapFast(const int n,
                __global realV* xgm,
                __global realV* ygm) {
   #pragma unroll
-  for (int w=0; w<WPT; ++w) {
-    const int id = w*get_global_size(0) + get_global_id(0);
+  for (int _w = 0; _w < WPT; _w += 1) {
+    const int id = _w*get_global_size(0) + get_global_id(0);
     realV temp = xgm[id];
     xgm[id] = ygm[id];
     ygm[id] = temp;
diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl
index b0772218..8a57bdfc 100644
--- a/src/kernels/level2/xher.opencl
+++ b/src/kernels/level2/xher.opencl
@@ -27,32 +27,34 @@ void Xher(const int n,
   const real alpha = GetRealArg(arg_alpha);
 
   // Register storage for X and XT
+  #pragma promote_to_registers
   real xvalues[WPT];
+  #pragma promote_to_registers
   real xtvalues[WPT];
 
   // Loads the X-vector
   #pragma unroll
-  for (int w=0; w<WPT; ++w) {
-    const int id2 = w*get_global_size(1) + get_global_id(1);
-    xvalues[w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor);
+  for (int _w = 0; _w < WPT; _w += 1) {
+    const int id2 = _w*get_global_size(1) + get_global_id(1);
+    xvalues[_w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor);
   }
 
   // Loads the X-transposed-vector
   #pragma unroll
-  for (int w=0; w<WPT; ++w) {
-    const int id1 = w*get_global_size(0) + get_global_id(0);
-    xtvalues[w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor);
+  for (int _w = 0; _w < WPT; _w += 1) {
+    const int id1 = _w*get_global_size(0) + get_global_id(0);
+    xtvalues[_w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor);
   }
 
   // Loops over the work per thread twice
   #pragma unroll
-  for (int w1=0; w1<WPT; ++w1) {
+  for (int _w1 = 0; _w1 < WPT; _w1 += 1) {
     #pragma unroll
-    for (int w2=0; w2<WPT; ++w2) {
+    for (int _w2 = 0; _w2 < WPT; _w2 += 1) {
 
       // Global thread IDs
-      const int id1 = w1*get_global_size(0) + get_global_id(0);
-      const int id2 = w2*get_global_size(1) + get_global_id(1);
+      const int id1 = _w1*get_global_size(0) + get_global_id(0);
+      const int id2 = _w2*get_global_size(1) + get_global_id(1);
 
       // Skip these threads if they do not contain threads contributing to the matrix-triangle
       if ((is_upper && (id1 > id2)) || (!is_upper && (id2 > id1))) {
@@ -61,7 +63,7 @@ void Xher(const int n,
 
       // Loads A, performs the operation, and stores the result into A
       else {
-        MatrixUpdate(id1, id2, n, n, agm, a_offset, a_ld, alpha, xvalues[w2], xtvalues[w1], is_upper);
+        MatrixUpdate(id1, id2, n, n, agm, a_offset, a_ld, alpha, xvalues[_w2], xtvalues[_w1], is_upper);
       }
     }
   }
diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl
index 00a756c9..73305149 100644
--- a/src/kernels/level2/xher2.opencl
+++ b/src/kernels/level2/xher2.opencl
@@ -28,37 +28,41 @@ void Xher2(const int n,
   const real alpha = GetRealArg(arg_alpha);
 
   // Register storage for X and Y
+  #pragma promote_to_registers
   real xvalues[WPT];
+  #pragma promote_to_registers
   real yvalues[WPT];
+  #pragma promote_to_registers
   real xtvalues[WPT];
+  #pragma promote_to_registers
   real ytvalues[WPT];
 
   // Loads the X-vector
   #pragma unroll
-  for (int w=0; w<WPT; ++w) {
-    const int id2 = w*get_global_size(1) + get_global_id(1);
-    xvalues[w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor);
+  for (int _w = 0; _w < WPT; _w += 1) {
+    const int id2 = _w*get_global_size(1) + get_global_id(1);
+    xvalues[_w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor);
   }
 
   // Loads the X-transposed-vector
   #pragma unroll
-  for (int w=0; w<WPT; ++w) {
-    const int id1 = w*get_global_size(0) + get_global_id(0);
-    xtvalues[w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor);
+  for (int _w = 0; _w < WPT; _w += 1) {
+    const int id1 = _w*get_global_size(0) + get_global_id(0);
+    xtvalues[_w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor);
   }
 
   // Loads the Y-vector
   #pragma unroll
-  for (int w=0; w<WPT; ++w) {
-    const int id1 = w*get_global_size(0) + get_global_id(0);
-    yvalues[w] = LoadVector(id1, n, ygm, y_offset, y_inc, is_rowmajor);
+  for (int _w = 0; _w < WPT; _w += 1) {
+    const int id1 = _w*get_global_size(0) + get_global_id(0);
+    yvalues[_w] = LoadVector(id1, n, ygm, y_offset, y_inc, is_rowmajor);
   }
 
   // Loads the Y-transposed-vector
   #pragma unroll
-  for (int w=0; w<WPT; ++w) {
-    const int id2 = w*get_global_size(1) + get_global_id(1);
-    ytvalues[w] = LoadVector(id2, n, ygm, y_offset, y_inc, !is_rowmajor);
+  for (int _w = 0; _w < WPT; _w += 1) {
+    const int id2 = _w*get_global_size(1) + get_global_id(1);
+    ytvalues[_w] = LoadVector(id2, n, ygm, y_offset, y_inc, !is_rowmajor);
   }
 
   // Sets the proper value of alpha in case conjugation is needed
@@ -75,13 +79,13 @@ void Xher2(const int n,
 
   // Loops over the work per thread twice
   #pragma unroll
-  for (int w1=0; w1<WPT; ++w1) {
+  for (int _w1 = 0; _w1 < WPT; _w1 += 1) {
     #pragma unroll
-    for (int w2=0; w2<WPT; ++w2) {
+    for (int _w2 = 0; _w2 < WPT; _w2 += 1) {
 
       // Global thread IDs
-      const int id1 = w1*get_global_size(0) + get_global_id(0);
-      const int id2 = w2*get_global_size(1) + get_global_id(1);
+      const int id1 = _w1*get_global_size(0) + get_global_id(0);
+      const int id2 = _w2*get_global_size(1) + get_global_id(1);
 
       // Skip these threads if they do not contain threads contributing to the matrix-triangle
       if ((is_upper && (id1 > id2)) || (!is_upper && (id2 > id1))) {
@@ -91,8 +95,8 @@ void Xher2(const int n,
       // Loads A, performs the operation, and stores the result into A
       else {
         MatrixUpdate2(id1, id2, n, n, agm, a_offset, a_ld,
-                      alpha1, xvalues[w2], yvalues[w1],
-                      alpha2, xtvalues[w1], ytvalues[w2], is_upper);
+                      alpha1, xvalues[_w2], yvalues[_w1],
+                      alpha2, xtvalues[_w1], ytvalues[_w2], is_upper);
       }
     }
   }
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl
index ed2ded98..0e89b78b 100644
--- a/src/kernels/level3/convert_hermitian.opencl
+++ b/src/kernels/level3/convert_hermitian.opencl
@@ -16,7 +16,8 @@
 R"(
 
 // =================================================================================================
-#if defined(ROUTINE_HEMM) && (PRECISION == 3232 || PRECISION == 6464)
+#if defined(ROUTINE_HEMM)
+#if PRECISION == 3232 || PRECISION == 6464
 
 // Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
 // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
@@ -30,11 +31,11 @@ void HermLowerToSquared(const int src_dim,
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+  for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0);
     #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+    for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1);
       if (id_two < dest_dim && id_one < dest_dim) {
 
         // Loads data from the lower-hermitian matrix
@@ -69,11 +70,11 @@ void HermUpperToSquared(const int src_dim,
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+  for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0);
     #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+    for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1);
       if (id_two < dest_dim && id_one < dest_dim) {
 
         // Loads data from the upper-hermitian matrix
@@ -98,6 +99,7 @@ void HermUpperToSquared(const int src_dim,
 }
 
 #endif
+#endif
 // =================================================================================================
 
 // End of the C++11 raw string literal
diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl
index 8ae53b37..83ecdd65 100644
--- a/src/kernels/level3/convert_symmetric.opencl
+++ b/src/kernels/level3/convert_symmetric.opencl
@@ -30,11 +30,11 @@ void SymmLowerToSquared(const int src_dim,
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+  for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0);
     #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+    for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1);
       if (id_two < dest_dim && id_one < dest_dim) {
 
         // Loads data from the lower-symmetric matrix
@@ -63,11 +63,11 @@ void SymmUpperToSquared(const int src_dim,
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+  for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0);
     #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+    for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1);
       if (id_two < dest_dim && id_one < dest_dim) {
 
         // Loads data from the upper-symmetric matrix
diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl
index f848dcc1..a9d5e769 100644
--- a/src/kernels/level3/convert_triangular.opencl
+++ b/src/kernels/level3/convert_triangular.opencl
@@ -31,11 +31,11 @@ void TriaLowerToSquared(const int src_dim,
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+  for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0);
     #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+    for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1);
       if (id_two < dest_dim && id_one < dest_dim) {
 
         // Loads data from the lower-triangular matrix
@@ -66,11 +66,11 @@ void TriaUpperToSquared(const int src_dim,
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+  for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0);
     #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+    for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1);
       if (id_two < dest_dim && id_one < dest_dim) {
 
         // Loads data from the upper-triangular matrix
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
index 281fdcff..db1513c1 100644
--- a/src/kernels/level3/invert_diagonal_blocks.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -82,14 +82,14 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
   // Loads the source lower triangle into local memory. Any values in the upper triangle or
   // outside of the matrix are set to zero
   #pragma unroll
-  for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
-    const bool condition = (is_upper) ? (thread_index <= j && block_index*INTERNAL_BLOCK_SIZE + j < n) :
-                                        (thread_index >= j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
+  for (int _j = 0; _j < INTERNAL_BLOCK_SIZE; _j += 1) {
+    const bool condition = (is_upper) ? (thread_index <= _j && block_index*INTERNAL_BLOCK_SIZE + _j < n) :
+                                        (thread_index >= _j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
     if (condition) {
-      lm[thread_index][j] = src[j*src_ld + thread_index + src_block_offset];
+      lm[thread_index][_j] = src[_j*src_ld + thread_index + src_block_offset];
     }
     else {
-      SetToZero(lm[thread_index][j]);
+      SetToZero(lm[thread_index][_j]);
     }
   }
   barrier(CLK_LOCAL_MEM_FENCE);
@@ -116,7 +116,6 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
       real sum;
       if (thread_index < j) {
         SetToZero(sum);
-        #pragma unroll
         for (int k = 0; k < j; ++k) {
           MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
         }
@@ -139,7 +138,6 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
       real sum;
       if (thread_index > j) {
         SetToZero(sum);
-        #pragma unroll
         for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) {
           MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
         }
@@ -156,7 +154,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
   
   // Writes the result to global memory
   #pragma unroll
-  for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+  for (int j = 0; j < INTERNAL_BLOCK_SIZE; j += 1) {
     dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j];
   }
 }
@@ -188,19 +186,17 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
   // Initializes the result registers
   real cpm[16];
   #pragma unroll
-  for (int j = 0; j < 16; ++j) {
-    SetToZero(cpm[j]);
+  for (int _j = 0; _j < 16; _j += 1) {
+    SetToZero(cpm[_j]);
   }
 
   // Computes NT x 16 block of C, each thread computes one 1 x 16 row
   for (int k = 0; k < current_size; k += 16) {
 
     // Loads a 16 x 16 block of B into local memory using NX x 4 threads
-    #pragma unroll
-    for( int i=0; i < 16; i += (size/4) ) {  // += get_local_size(0)
-      #pragma unroll
-      for( int j=0; j < 16; j += 4 ) {  // += get_local_size(1)
-        blm[(lidx + i) * LOCALX + (lidy + j)] = bgm[k + i + j*ldb];
+    for (int i = 0; i < 16; i += (size/4) ) {  // += get_local_size(0)
+      for (int _j = 0; _j < 16; _j += 4 ) {  // += get_local_size(1)
+        blm[(lidx + i) * LOCALX + (lidy + _j)] = bgm[k + i + _j*ldb];
       }
     }
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -210,11 +206,11 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
 
       // Performs 16 x 16 multiply-add operations
       #pragma unroll
-      for (int i = 0; i < 16; ++i) {
+      for (int _i = 0; _i < 16; _i += 1) {
         if (part == 2 || col++ < n) {
           #pragma unroll
-          for (int j = 0; j < 16; ++j) {
-            MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
+          for (int _j = 0; _j < 16; _j += 1) {
+            MultiplyAdd(cpm[_j], agm[(_i + k) * lda], blm[_i * LOCALX + _j]);
           }
         }
       }
@@ -226,10 +222,10 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
 
         // Performs 16 x 16 multiply-add operations
         #pragma unroll
-        for (int i = 0; i < 16; ++i) {
+        for (int _i = 0; _i < 16; _i += 1) {
           #pragma unroll
-          for (int j = 0; j < 16; ++j) {
-            MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
+          for (int _j = 0; _j < 16; _j += 1) {
+            MultiplyAdd(cpm[_j], agm[(_i + k) * lda], blm[_i * LOCALX + _j]);
           }
         }
       }
@@ -240,9 +236,9 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
 
   // Stores NT x 16 results: each thread writes one 16 x 1 row
   #pragma unroll
-  for (int i = 0; i < 16; ++i) {
-    if (part == 2) { Negate(cpm[i]); }
-    cgm[0] = cpm[i];
+  for (int _i = 0; _i < 16; _i += 1) {
+    if (part == 2) { Negate(cpm[_i]); }
+    cgm[0] = cpm[_i];
     cgm += ldc;
   }
 }
diff --git a/src/kernels/levelx/im2col.opencl b/src/kernels/levelx/im2col.opencl
index e7f69420..301e076b 100644
--- a/src/kernels/levelx/im2col.opencl
+++ b/src/kernels/levelx/im2col.opencl
@@ -41,9 +41,7 @@ void im2col(const int input_h, const int input_w, const int channels,
   const int c_id = ((int)get_global_id(1)) / output_h; // input channels
   if (h_id < output_h && w_id < output_w && c_id < channels) {
 
-    #pragma unroll
     for (int kh_id = 0; kh_id < kernel_h; ++kh_id) { // kernel height
-      #pragma unroll
       for (int kw_id = 0; kw_id < kernel_w; ++kw_id) { // kernel width
 
         // Retrieves the input value
diff --git a/test/correctness/misc/preprocessor.cpp b/test/correctness/misc/preprocessor.cpp
index 92ca2490..8b5f16f5 100644
--- a/test/correctness/misc/preprocessor.cpp
+++ b/test/correctness/misc/preprocessor.cpp
@@ -243,6 +243,14 @@ size_t RunPreprocessor(int argc, char *argv[], const bool silent, const Precisio
   ;
   if (TestKernel(device, context, "XgemmDirectTN", gemm_direct_sources, precision)) { passed++; } else { errors++; }
 
+  // HEMM
+  const auto herm_sources =
+    "#define ROUTINE_HEMM\n"
+    #include "../src/kernels/level3/level3.opencl"
+    #include "../src/kernels/level3/convert_hermitian.opencl"
+  ;
+  if (TestKernel(device, context, "HermLowerToSquared", herm_sources, precision)) { passed++; } else { errors++; }
+
   // Prints and returns the statistics
   std::cout << std::endl;
   std::cout << "    " << passed << " test(s) passed" << std::endl;
@@ -258,6 +266,7 @@ size_t RunPreprocessor(int argc, char *argv[], const bool silent, const Precisio
 int main(int argc, char *argv[]) {
   auto errors = size_t{0};
   errors += clblast::RunPreprocessor(argc, argv, false, clblast::Precision::kSingle);
+  errors += clblast::RunPreprocessor(argc, argv, true, clblast::Precision::kComplexDouble);
   if (errors > 0) { return 1; } else { return 0; }
 }