summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2017-12-09 20:44:21 +0100
committerCedric Nugteren <web@cedricnugteren.nl>2017-12-09 20:44:21 +0100
commit9f02fb542ca659bf58d1efefdb334ea386ef10e8 (patch)
tree8669e215f38e8b00a4ee73a436479207f2afdbf5
parentca5dbcd2bd31fb0a0e3f6c2f81b3c0fff6250738 (diff)
Completed kernel modifications for pre-processor of all other kernels
-rw-r--r--src/kernels/level1/xamax.opencl2
-rw-r--r--src/kernels/level1/xasum.opencl2
-rw-r--r--src/kernels/level1/xcopy.opencl5
-rw-r--r--src/kernels/level1/xnrm2.opencl2
-rw-r--r--src/kernels/level1/xscal.opencl5
-rw-r--r--src/kernels/level1/xswap.opencl5
-rw-r--r--src/kernels/level2/xher.opencl24
-rw-r--r--src/kernels/level2/xher2.opencl40
-rw-r--r--src/kernels/level3/convert_hermitian.opencl20
-rw-r--r--src/kernels/level3/convert_symmetric.opencl16
-rw-r--r--src/kernels/level3/convert_triangular.opencl16
-rw-r--r--src/kernels/level3/invert_diagonal_blocks.opencl44
-rw-r--r--src/kernels/levelx/im2col.opencl2
-rw-r--r--test/correctness/misc/preprocessor.cpp9
14 files changed, 97 insertions, 95 deletions
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 2bd2f714..27add015 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -75,7 +75,6 @@ void Xamax(const int n,
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory
- #pragma unroll
for (int s=WGS1/2; s>0; s=s>>1) {
if (lid < s) {
if (maxlm[lid + s] >= maxlm[lid]) {
@@ -117,7 +116,6 @@ void XamaxEpilogue(const __global singlereal* restrict maxgm,
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory
- #pragma unroll
for (int s=WGS2/2; s>0; s=s>>1) {
if (lid < s) {
if (maxlm[lid + s] >= maxlm[lid]) {
diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl
index 1fc91be8..29e7fa3e 100644
--- a/src/kernels/level1/xasum.opencl
+++ b/src/kernels/level1/xasum.opencl
@@ -56,7 +56,6 @@ void Xasum(const int n,
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory
- #pragma unroll
for (int s=WGS1/2; s>0; s=s>>1) {
if (lid < s) {
Add(lm[lid], lm[lid], lm[lid + s]);
@@ -85,7 +84,6 @@ void XasumEpilogue(const __global real* restrict input,
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory
- #pragma unroll
for (int s=WGS2/2; s>0; s=s>>1) {
if (lid < s) {
Add(lm[lid], lm[lid], lm[lid + s]);
diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl
index 228e0735..aed80fc2 100644
--- a/src/kernels/level1/xcopy.opencl
+++ b/src/kernels/level1/xcopy.opencl
@@ -28,7 +28,6 @@ void Xcopy(const int n,
__global real* ygm, const int y_offset, const int y_inc) {
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
- #pragma unroll
for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
ygm[id*y_inc + y_offset] = xgm[id*x_inc + x_offset];
}
@@ -43,8 +42,8 @@ void XcopyFast(const int n,
const __global realV* restrict xgm,
__global realV* ygm) {
#pragma unroll
- for (int w=0; w<WPT; ++w) {
- const int id = w*get_global_size(0) + get_global_id(0);
+ for (int _w = 0; _w < WPT; _w += 1) {
+ const int id = _w*get_global_size(0) + get_global_id(0);
ygm[id] = xgm[id];
}
}
diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl
index f6d869cb..6a81c150 100644
--- a/src/kernels/level1/xnrm2.opencl
+++ b/src/kernels/level1/xnrm2.opencl
@@ -54,7 +54,6 @@ void Xnrm2(const int n,
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory
- #pragma unroll
for (int s=WGS1/2; s>0; s=s>>1) {
if (lid < s) {
Add(lm[lid], lm[lid], lm[lid + s]);
@@ -83,7 +82,6 @@ void Xnrm2Epilogue(const __global real* restrict input,
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory
- #pragma unroll
for (int s=WGS2/2; s>0; s=s>>1) {
if (lid < s) {
Add(lm[lid], lm[lid], lm[lid + s]);
diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl
index 3da9c2fd..cb133e88 100644
--- a/src/kernels/level1/xscal.opencl
+++ b/src/kernels/level1/xscal.opencl
@@ -28,7 +28,6 @@ void Xscal(const int n, const real_arg arg_alpha,
const real alpha = GetRealArg(arg_alpha);
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
- #pragma unroll
for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
real xvalue = xgm[id*x_inc + x_offset];
real result;
@@ -47,8 +46,8 @@ void XscalFast(const int n, const real_arg arg_alpha,
const real alpha = GetRealArg(arg_alpha);
#pragma unroll
- for (int w=0; w<WPT; ++w) {
- const int id = w*get_global_size(0) + get_global_id(0);
+ for (int _w = 0; _w < WPT; _w += 1) {
+ const int id = _w*get_global_size(0) + get_global_id(0);
realV xvalue = xgm[id];
realV result;
result = MultiplyVector(result, alpha, xvalue);
diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl
index 267271c0..bf5b6194 100644
--- a/src/kernels/level1/xswap.opencl
+++ b/src/kernels/level1/xswap.opencl
@@ -28,7 +28,6 @@ void Xswap(const int n,
__global real* ygm, const int y_offset, const int y_inc) {
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
- #pragma unroll
for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
real temp = xgm[id*x_inc + x_offset];
xgm[id*x_inc + x_offset] = ygm[id*y_inc + y_offset];
@@ -45,8 +44,8 @@ void XswapFast(const int n,
__global realV* xgm,
__global realV* ygm) {
#pragma unroll
- for (int w=0; w<WPT; ++w) {
- const int id = w*get_global_size(0) + get_global_id(0);
+ for (int _w = 0; _w < WPT; _w += 1) {
+ const int id = _w*get_global_size(0) + get_global_id(0);
realV temp = xgm[id];
xgm[id] = ygm[id];
ygm[id] = temp;
diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl
index b0772218..8a57bdfc 100644
--- a/src/kernels/level2/xher.opencl
+++ b/src/kernels/level2/xher.opencl
@@ -27,32 +27,34 @@ void Xher(const int n,
const real alpha = GetRealArg(arg_alpha);
// Register storage for X and XT
+ #pragma promote_to_registers
real xvalues[WPT];
+ #pragma promote_to_registers
real xtvalues[WPT];
// Loads the X-vector
#pragma unroll
- for (int w=0; w<WPT; ++w) {
- const int id2 = w*get_global_size(1) + get_global_id(1);
- xvalues[w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor);
+ for (int _w = 0; _w < WPT; _w += 1) {
+ const int id2 = _w*get_global_size(1) + get_global_id(1);
+ xvalues[_w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor);
}
// Loads the X-transposed-vector
#pragma unroll
- for (int w=0; w<WPT; ++w) {
- const int id1 = w*get_global_size(0) + get_global_id(0);
- xtvalues[w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor);
+ for (int _w = 0; _w < WPT; _w += 1) {
+ const int id1 = _w*get_global_size(0) + get_global_id(0);
+ xtvalues[_w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor);
}
// Loops over the work per thread twice
#pragma unroll
- for (int w1=0; w1<WPT; ++w1) {
+ for (int _w1 = 0; _w1 < WPT; _w1 += 1) {
#pragma unroll
- for (int w2=0; w2<WPT; ++w2) {
+ for (int _w2 = 0; _w2 < WPT; _w2 += 1) {
// Global thread IDs
- const int id1 = w1*get_global_size(0) + get_global_id(0);
- const int id2 = w2*get_global_size(1) + get_global_id(1);
+ const int id1 = _w1*get_global_size(0) + get_global_id(0);
+ const int id2 = _w2*get_global_size(1) + get_global_id(1);
// Skip these threads if they do not contain threads contributing to the matrix-triangle
if ((is_upper && (id1 > id2)) || (!is_upper && (id2 > id1))) {
@@ -61,7 +63,7 @@ void Xher(const int n,
// Loads A, performs the operation, and stores the result into A
else {
- MatrixUpdate(id1, id2, n, n, agm, a_offset, a_ld, alpha, xvalues[w2], xtvalues[w1], is_upper);
+ MatrixUpdate(id1, id2, n, n, agm, a_offset, a_ld, alpha, xvalues[_w2], xtvalues[_w1], is_upper);
}
}
}
diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl
index 00a756c9..73305149 100644
--- a/src/kernels/level2/xher2.opencl
+++ b/src/kernels/level2/xher2.opencl
@@ -28,37 +28,41 @@ void Xher2(const int n,
const real alpha = GetRealArg(arg_alpha);
// Register storage for X and Y
+ #pragma promote_to_registers
real xvalues[WPT];
+ #pragma promote_to_registers
real yvalues[WPT];
+ #pragma promote_to_registers
real xtvalues[WPT];
+ #pragma promote_to_registers
real ytvalues[WPT];
// Loads the X-vector
#pragma unroll
- for (int w=0; w<WPT; ++w) {
- const int id2 = w*get_global_size(1) + get_global_id(1);
- xvalues[w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor);
+ for (int _w = 0; _w < WPT; _w += 1) {
+ const int id2 = _w*get_global_size(1) + get_global_id(1);
+ xvalues[_w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor);
}
// Loads the X-transposed-vector
#pragma unroll
- for (int w=0; w<WPT; ++w) {
- const int id1 = w*get_global_size(0) + get_global_id(0);
- xtvalues[w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor);
+ for (int _w = 0; _w < WPT; _w += 1) {
+ const int id1 = _w*get_global_size(0) + get_global_id(0);
+ xtvalues[_w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor);
}
// Loads the Y-vector
#pragma unroll
- for (int w=0; w<WPT; ++w) {
- const int id1 = w*get_global_size(0) + get_global_id(0);
- yvalues[w] = LoadVector(id1, n, ygm, y_offset, y_inc, is_rowmajor);
+ for (int _w = 0; _w < WPT; _w += 1) {
+ const int id1 = _w*get_global_size(0) + get_global_id(0);
+ yvalues[_w] = LoadVector(id1, n, ygm, y_offset, y_inc, is_rowmajor);
}
// Loads the Y-transposed-vector
#pragma unroll
- for (int w=0; w<WPT; ++w) {
- const int id2 = w*get_global_size(1) + get_global_id(1);
- ytvalues[w] = LoadVector(id2, n, ygm, y_offset, y_inc, !is_rowmajor);
+ for (int _w = 0; _w < WPT; _w += 1) {
+ const int id2 = _w*get_global_size(1) + get_global_id(1);
+ ytvalues[_w] = LoadVector(id2, n, ygm, y_offset, y_inc, !is_rowmajor);
}
// Sets the proper value of alpha in case conjugation is needed
@@ -75,13 +79,13 @@ void Xher2(const int n,
// Loops over the work per thread twice
#pragma unroll
- for (int w1=0; w1<WPT; ++w1) {
+ for (int _w1 = 0; _w1 < WPT; _w1 += 1) {
#pragma unroll
- for (int w2=0; w2<WPT; ++w2) {
+ for (int _w2 = 0; _w2 < WPT; _w2 += 1) {
// Global thread IDs
- const int id1 = w1*get_global_size(0) + get_global_id(0);
- const int id2 = w2*get_global_size(1) + get_global_id(1);
+ const int id1 = _w1*get_global_size(0) + get_global_id(0);
+ const int id2 = _w2*get_global_size(1) + get_global_id(1);
// Skip these threads if they do not contain threads contributing to the matrix-triangle
if ((is_upper && (id1 > id2)) || (!is_upper && (id2 > id1))) {
@@ -91,8 +95,8 @@ void Xher2(const int n,
// Loads A, performs the operation, and stores the result into A
else {
MatrixUpdate2(id1, id2, n, n, agm, a_offset, a_ld,
- alpha1, xvalues[w2], yvalues[w1],
- alpha2, xtvalues[w1], ytvalues[w2], is_upper);
+ alpha1, xvalues[_w2], yvalues[_w1],
+ alpha2, xtvalues[_w1], ytvalues[_w2], is_upper);
}
}
}
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl
index ed2ded98..0e89b78b 100644
--- a/src/kernels/level3/convert_hermitian.opencl
+++ b/src/kernels/level3/convert_hermitian.opencl
@@ -16,7 +16,8 @@
R"(
// =================================================================================================
-#if defined(ROUTINE_HEMM) && (PRECISION == 3232 || PRECISION == 6464)
+#if defined(ROUTINE_HEMM)
+#if PRECISION == 3232 || PRECISION == 6464
// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
@@ -30,11 +31,11 @@ void HermLowerToSquared(const int src_dim,
// Loops over the work per thread in both dimensions
#pragma unroll
- for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
- const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0);
#pragma unroll
- for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
- const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the lower-hermitian matrix
@@ -69,11 +70,11 @@ void HermUpperToSquared(const int src_dim,
// Loops over the work per thread in both dimensions
#pragma unroll
- for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
- const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0);
#pragma unroll
- for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
- const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the upper-hermitian matrix
@@ -98,6 +99,7 @@ void HermUpperToSquared(const int src_dim,
}
#endif
+#endif
// =================================================================================================
// End of the C++11 raw string literal
diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl
index 8ae53b37..83ecdd65 100644
--- a/src/kernels/level3/convert_symmetric.opencl
+++ b/src/kernels/level3/convert_symmetric.opencl
@@ -30,11 +30,11 @@ void SymmLowerToSquared(const int src_dim,
// Loops over the work per thread in both dimensions
#pragma unroll
- for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
- const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0);
#pragma unroll
- for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
- const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the lower-symmetric matrix
@@ -63,11 +63,11 @@ void SymmUpperToSquared(const int src_dim,
// Loops over the work per thread in both dimensions
#pragma unroll
- for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
- const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0);
#pragma unroll
- for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
- const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the upper-symmetric matrix
diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl
index f848dcc1..a9d5e769 100644
--- a/src/kernels/level3/convert_triangular.opencl
+++ b/src/kernels/level3/convert_triangular.opencl
@@ -31,11 +31,11 @@ void TriaLowerToSquared(const int src_dim,
// Loops over the work per thread in both dimensions
#pragma unroll
- for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
- const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0);
#pragma unroll
- for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
- const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the lower-triangular matrix
@@ -66,11 +66,11 @@ void TriaUpperToSquared(const int src_dim,
// Loops over the work per thread in both dimensions
#pragma unroll
- for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
- const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ for (int _w_one = 0; _w_one < PAD_WPTX; _w_one += 1) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + _w_one) * PAD_DIMX + get_local_id(0);
#pragma unroll
- for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
- const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ for (int _w_two = 0; _w_two < PAD_WPTY; _w_two += 1) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + _w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the upper-triangular matrix
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
index 281fdcff..db1513c1 100644
--- a/src/kernels/level3/invert_diagonal_blocks.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -82,14 +82,14 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
// Loads the source lower triangle into local memory. Any values in the upper triangle or
// outside of the matrix are set to zero
#pragma unroll
- for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
- const bool condition = (is_upper) ? (thread_index <= j && block_index*INTERNAL_BLOCK_SIZE + j < n) :
- (thread_index >= j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
+ for (int _j = 0; _j < INTERNAL_BLOCK_SIZE; _j += 1) {
+ const bool condition = (is_upper) ? (thread_index <= _j && block_index*INTERNAL_BLOCK_SIZE + _j < n) :
+ (thread_index >= _j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
if (condition) {
- lm[thread_index][j] = src[j*src_ld + thread_index + src_block_offset];
+ lm[thread_index][_j] = src[_j*src_ld + thread_index + src_block_offset];
}
else {
- SetToZero(lm[thread_index][j]);
+ SetToZero(lm[thread_index][_j]);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -116,7 +116,6 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
real sum;
if (thread_index < j) {
SetToZero(sum);
- #pragma unroll
for (int k = 0; k < j; ++k) {
MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
}
@@ -139,7 +138,6 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
real sum;
if (thread_index > j) {
SetToZero(sum);
- #pragma unroll
for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) {
MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
}
@@ -156,7 +154,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
// Writes the result to global memory
#pragma unroll
- for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+ for (int j = 0; j < INTERNAL_BLOCK_SIZE; j += 1) {
dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j];
}
}
@@ -188,19 +186,17 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
// Initializes the result registers
real cpm[16];
#pragma unroll
- for (int j = 0; j < 16; ++j) {
- SetToZero(cpm[j]);
+ for (int _j = 0; _j < 16; _j += 1) {
+ SetToZero(cpm[_j]);
}
// Computes NT x 16 block of C, each thread computes one 1 x 16 row
for (int k = 0; k < current_size; k += 16) {
// Loads a 16 x 16 block of B into local memory using NX x 4 threads
- #pragma unroll
- for( int i=0; i < 16; i += (size/4) ) { // += get_local_size(0)
- #pragma unroll
- for( int j=0; j < 16; j += 4 ) { // += get_local_size(1)
- blm[(lidx + i) * LOCALX + (lidy + j)] = bgm[k + i + j*ldb];
+ for (int i = 0; i < 16; i += (size/4) ) { // += get_local_size(0)
+ for (int _j = 0; _j < 16; _j += 4 ) { // += get_local_size(1)
+ blm[(lidx + i) * LOCALX + (lidy + _j)] = bgm[k + i + _j*ldb];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -210,11 +206,11 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
// Performs 16 x 16 multiply-add operations
#pragma unroll
- for (int i = 0; i < 16; ++i) {
+ for (int _i = 0; _i < 16; _i += 1) {
if (part == 2 || col++ < n) {
#pragma unroll
- for (int j = 0; j < 16; ++j) {
- MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
+ for (int _j = 0; _j < 16; _j += 1) {
+ MultiplyAdd(cpm[_j], agm[(_i + k) * lda], blm[_i * LOCALX + _j]);
}
}
}
@@ -226,10 +222,10 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
// Performs 16 x 16 multiply-add operations
#pragma unroll
- for (int i = 0; i < 16; ++i) {
+ for (int _i = 0; _i < 16; _i += 1) {
#pragma unroll
- for (int j = 0; j < 16; ++j) {
- MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
+ for (int _j = 0; _j < 16; _j += 1) {
+ MultiplyAdd(cpm[_j], agm[(_i + k) * lda], blm[_i * LOCALX + _j]);
}
}
}
@@ -240,9 +236,9 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
// Stores NT x 16 results: each thread writes one 16 x 1 row
#pragma unroll
- for (int i = 0; i < 16; ++i) {
- if (part == 2) { Negate(cpm[i]); }
- cgm[0] = cpm[i];
+ for (int _i = 0; _i < 16; _i += 1) {
+ if (part == 2) { Negate(cpm[_i]); }
+ cgm[0] = cpm[_i];
cgm += ldc;
}
}
diff --git a/src/kernels/levelx/im2col.opencl b/src/kernels/levelx/im2col.opencl
index e7f69420..301e076b 100644
--- a/src/kernels/levelx/im2col.opencl
+++ b/src/kernels/levelx/im2col.opencl
@@ -41,9 +41,7 @@ void im2col(const int input_h, const int input_w, const int channels,
const int c_id = ((int)get_global_id(1)) / output_h; // input channels
if (h_id < output_h && w_id < output_w && c_id < channels) {
- #pragma unroll
for (int kh_id = 0; kh_id < kernel_h; ++kh_id) { // kernel height
- #pragma unroll
for (int kw_id = 0; kw_id < kernel_w; ++kw_id) { // kernel width
// Retrieves the input value
diff --git a/test/correctness/misc/preprocessor.cpp b/test/correctness/misc/preprocessor.cpp
index 92ca2490..8b5f16f5 100644
--- a/test/correctness/misc/preprocessor.cpp
+++ b/test/correctness/misc/preprocessor.cpp
@@ -243,6 +243,14 @@ size_t RunPreprocessor(int argc, char *argv[], const bool silent, const Precisio
;
if (TestKernel(device, context, "XgemmDirectTN", gemm_direct_sources, precision)) { passed++; } else { errors++; }
+ // HEMM
+ const auto herm_sources =
+ "#define ROUTINE_HEMM\n"
+ #include "../src/kernels/level3/level3.opencl"
+ #include "../src/kernels/level3/convert_hermitian.opencl"
+ ;
+ if (TestKernel(device, context, "HermLowerToSquared", herm_sources, precision)) { passed++; } else { errors++; }
+
// Prints and returns the statistics
std::cout << std::endl;
std::cout << " " << passed << " test(s) passed" << std::endl;
@@ -258,6 +266,7 @@ size_t RunPreprocessor(int argc, char *argv[], const bool silent, const Precisio
int main(int argc, char *argv[]) {
auto errors = size_t{0};
errors += clblast::RunPreprocessor(argc, argv, false, clblast::Precision::kSingle);
+ errors += clblast::RunPreprocessor(argc, argv, true, clblast::Precision::kComplexDouble);
if (errors > 0) { return 1; } else { return 0; }
}